In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Smoking

• Per-capita cigarette consumption (in packs). Source: Orzechowski and Walker (2005). These data are based on the total tax
paid on sales of packs of cigarettes in a particular state divided
by its total population.

• Average retail price per pack of cigarettes (in cents). Source:
Orzechowski and Walker (2005). Price figures include state sales
taxes, if applicable.

• Per-capita state personal income (logged). Source: Bureau of the
Census, United States Statistical Abstract. Converted to 1997 dollars using the Consumer Price Index.

• State population and percent of state population aged 15–24.
Source: U.S. Census Bureau.

• Per-capita beer consumption. Source: Beer Institute’s Brewer’s
Almanac. Measured as the per-capita consumption of malt beverages (in gallons).

In [2]:
# Load the smoking panel and min-max scale every feature column.
url = 'https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/smoking_data.csv'
df = pd.read_csv(url)

# The first two columns are the identifiers (state, year); everything from
# column 2 onward is treated as a feature and rescaled.
feature_cols = list(df.columns[2:])

scaler = MinMaxScaler()
features = scaler.fit_transform(df[feature_cols])

# Assign by column label so each column is replaced by its scaled float
# version. Writing through `df.iloc[:, 2:] = ...` sets values in place and
# relies on implicit upcasting whenever a source column has an integer dtype.
df[feature_cols] = features

In [3]:
# Panel dimensions: distinct state IDs (units) and distinct years (time steps).
n_state = len(pd.unique(df['state']))
n_time = len(pd.unique(df['year']))
(n_state, n_time)

(39, 31)

In [4]:
# Build the [state, year, feature value] tensor and its missing-value mask,
# then persist both under smoking/.
df = df.sort_values(by=['state', 'year'])

# reshape() below only lines rows up with (state, year) cells when the panel is
# balanced: exactly one row per state per year. Fail loudly otherwise.
assert not df.duplicated(subset=['state', 'year']).any(), 'duplicate (state, year) rows'
assert (df.groupby('state').size() == n_time).all(), 'unbalanced panel: a state is missing years'
assert len(df) == n_state * n_time, 'row count does not match n_state * n_time'

# The mask is taken from the un-filled frame, and `df` itself is NOT overwritten
# with the filled copy, so re-running this cell reproduces the same mask.
mask_df = np.array(df.isna())
mask_smoking = mask_df.reshape(n_state, n_time, df.shape[1])[:, :, 2:] #remove state and year from features

# Drop the identifier columns *before* converting to NumPy. The remaining
# columns were already passed through MinMaxScaler, so they are numeric and can
# be stored as a plain float array. `df.values` on the whole frame would yield
# an object array if `state` holds strings, which np.save has to pickle.
arr_smoking = (
    df.iloc[:, 2:]  #remove state and year from features
    .fillna(0)      # missing entries become 0; mask_smoking records where
    .to_numpy(dtype=float)
    .reshape(n_state, n_time, -1)
)

os.makedirs('smoking', exist_ok=True)  # np.save does not create directories
np.save('smoking/smoking.npy', arr_smoking)
np.save('smoking/smoking_mask.npy', mask_smoking)


In [5]:
# Batched SVD over the leading (state) axis: one decomposition per state's
# (year x feature) slice. `v` is the third factor returned by np.linalg.svd.
u, s, v = np.linalg.svd(np.asarray(arr_smoking, dtype=float))
(arr_smoking.shape, '', u.shape, s.shape, v.shape)

((39, 31, 5), '', (39, 31, 31), (39, 5), (39, 5, 5))

# German Reunification 
- GDP per Capita (PPP, 2002 USD).
- Investment Rate: Ratio of real domestic investment (private plus public) to real GDP. The data are reported in five-year averages.
- Schooling: Percentage of the total population aged 25 and older that has attained secondary schooling. The data are reported in five-year increments.
- Industry: industry share of value added.
- Inflation: annual percentage change in consumer prices (base year 1995).
- Trade Openness: Exports plus imports as a percentage of GDP.

In [6]:
# Load the German reunification panel and min-max scale every feature column.
url = 'https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/german_reunification.csv'
df = pd.read_csv(url)

# The first three columns are identifiers (code, country, year); everything
# from column 3 onward is treated as a feature and rescaled.
feature_cols = list(df.columns[3:])

scaler = MinMaxScaler()
features = scaler.fit_transform(df[feature_cols])

# Assign by column label so each column is replaced by its scaled float
# version. Writing through `df.iloc[:, 3:] = ...` sets values in place and
# relies on implicit upcasting whenever a source column has an integer dtype.
df[feature_cols] = features

n_state = len(df.country.unique()) #number of units
n_time = len(df.year.unique()) #number of time steps
n_state, n_time

(17, 44)

In [7]:
# Build the [country, year, feature value] tensor and its missing-value mask,
# then persist both under germany/.
df = df.sort_values(by=['country', 'year'])

# reshape() below only lines rows up with (country, year) cells when the panel
# is balanced: exactly one row per country per year. Fail loudly otherwise.
assert not df.duplicated(subset=['country', 'year']).any(), 'duplicate (country, year) rows'
assert (df.groupby('country').size() == n_time).all(), 'unbalanced panel: a country is missing years'
assert len(df) == n_state * n_time, 'row count does not match n_state * n_time'

# The mask is taken from the un-filled frame, and `df` itself is NOT overwritten
# with the filled copy, so re-running this cell reproduces the same mask.
mask_df = np.array(df.isna())
mask_german = mask_df.reshape(n_state, n_time, df.shape[1])[:, :, 3:] #remove code, country, and year from features

# Drop the identifier columns *before* converting to NumPy. The remaining
# columns were already passed through MinMaxScaler, so they are numeric and can
# be stored as a plain float array instead of an object array that np.save
# would have to pickle.
arr_german = (
    df.iloc[:, 3:]  #remove code, country, and year from features
    .fillna(0)      # missing entries become 0; mask_german records where
    .to_numpy(dtype=float)
    .reshape(n_state, n_time, -1)
)

os.makedirs('germany', exist_ok=True)  # np.save does not create directories
np.save('germany/germany.npy', arr_german)
np.save('germany/germany_mask.npy', mask_german)

# Basque 
- Data on terrorist activity (deaths and kidnappings) are provided by the Spanish Ministry of Interior (2002). 
- Regional data on GDP, investment, population density, and sectoral production come from Fundación BBV (1999). Data on human capital for different regions have been collected by Mas et al. (1998). 
- Oil prices come from the OECD statistical compendium CD-ROM. 
- Data on stock prices, firm size (market value of outstanding shares), book equity, and dividends are routinely collected by the Madrid Stock Exchange (www.bolsamadrid.es). 
- Interest rates on one-day public debt repurchase agreements and bonds come from the Bank of Spain.

In [8]:
# Load the Basque panel and min-max scale every feature column.
url = 'https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/basque_data.csv'
df = pd.read_csv(url)

# The first four columns are left unscaled; they include `regionname` and
# `year` (presumably alongside a row index / region number; TODO confirm).
# Everything from column 4 onward is treated as a feature and rescaled.
feature_cols = list(df.columns[4:])

scaler = MinMaxScaler()
features = scaler.fit_transform(df[feature_cols])

# Assign by column label so each column is replaced by its scaled float
# version. Writing through `df.iloc[:, 4:] = ...` sets values in place and
# relies on implicit upcasting whenever a source column has an integer dtype.
df[feature_cols] = features

n_state = len(df.regionname.unique()) #number of units
n_time = len(df.year.unique()) #number of time steps
n_state, n_time

(18, 43)

In [9]:
# Build the [region, year, feature value] tensor and its missing-value mask,
# then persist both under basque/.
df = df.sort_values(by=['regionname', 'year'])

# reshape() below only lines rows up with (region, year) cells when the panel
# is balanced: exactly one row per region per year. Fail loudly otherwise.
assert not df.duplicated(subset=['regionname', 'year']).any(), 'duplicate (regionname, year) rows'
assert (df.groupby('regionname').size() == n_time).all(), 'unbalanced panel: a region is missing years'
assert len(df) == n_state * n_time, 'row count does not match n_state * n_time'

# The mask is taken from the un-filled frame, and `df` itself is NOT overwritten
# with the filled copy, so re-running this cell reproduces the same mask.
mask_df = np.array(df.isna())
mask_basque = mask_df.reshape(n_state, n_time, df.shape[1])[:, :, 4:] # remove the four leading non-feature columns

# Drop the four leading non-feature columns *before* converting to NumPy. The
# remaining columns were already passed through MinMaxScaler, so they are
# numeric and can be stored as a plain float array instead of an object array
# that np.save would have to pickle.
arr_basque = (
    df.iloc[:, 4:]  # remove the four leading non-feature columns
    .fillna(0)      # missing entries become 0; mask_basque records where
    .to_numpy(dtype=float)
    .reshape(n_state, n_time, -1)
)

os.makedirs('basque', exist_ok=True)  # np.save does not create directories
np.save('basque/basque.npy', arr_basque)
np.save('basque/basque_mask.npy', mask_basque)

# Retail
From https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting

In [10]:
# Load the per-store weekly features and the store metadata, join them on
# `Store`, and min-max scale the continuous columns.
features_raw = pd.read_csv('retail/features.csv')
stores = pd.read_csv('retail/stores.csv')
df = features_raw.merge(stores, how='inner', on="Store")

continuous_features = ['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size']
discrete_features = ['IsHoliday', 'Type']
state = ['Store']
time = ['Date']

# Fix the column order as: state, time, continuous features, discrete features.
# .copy() makes this an independent frame, so the assignment below is not a
# write into a selection of the merged frame.
df = df[state + time + continuous_features + discrete_features].copy()

scaler = MinMaxScaler()
features = scaler.fit_transform(df[continuous_features])

# Assign by column label so each column is replaced by its scaled float
# version. The previous `df.loc[:, mask] = ...` form sets values in place and
# relies on implicit upcasting for any integer-typed column (e.g. presumably
# `Size`; TODO confirm dtype).
df[continuous_features] = features

n_state = len(df.Store.unique()) #number of units
n_time = len(df.Date.unique()) #number of time steps
n_state, n_time

(45, 182)

In [11]:
# Build the [store, date, feature value] tensor and its missing-value mask,
# then persist both under retail/.
df = df.sort_values(by=['Store', 'Date'])

# reshape() below only lines rows up with (store, date) cells when the panel is
# balanced: exactly one row per store per date. Fail loudly otherwise.
assert not df.duplicated(subset=['Store', 'Date']).any(), 'duplicate (Store, Date) rows'
assert (df.groupby('Store').size() == n_time).all(), 'unbalanced panel: a store is missing dates'
assert len(df) == n_state * n_time, 'row count does not match n_state * n_time'

# The mask is taken from the un-filled frame, and `df` itself is NOT overwritten
# with the filled copy, so re-running this cell reproduces the same mask.
mask_df = np.array(df.isna())
mask_retail = mask_df.reshape(n_state, n_time, df.shape[1])[:, :, 2:] # remove store and date

# NOTE(review): the discrete columns (IsHoliday, Type) are kept un-encoded, so
# this is presumably an object-dtype array that must be loaded with
# np.load(..., allow_pickle=True). Confirm, and consider encoding them.
arr_retail = df.fillna(0).values.reshape(n_state, n_time, df.shape[1])[:, :, 2:] # remove store and date

np.save('retail/retail.npy', arr_retail)
np.save('retail/retail_mask.npy', mask_retail)