In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os
import glob
from datetime import datetime

# Smoking

• Per-capita cigarette consumption (in packs). Source: Orzechowski and Walker (2005). These data are based on the total tax
paid on sales of packs of cigarettes in a particular state divided
by its total population.

• Average retail price per pack of cigarettes (in cents). Source:
Orzechowski and Walker (2005). Price figures include state sales
taxes, if applicable.

• Per-capita state personal income (logged). Source: Bureau of the
Census, United States Statistical Abstract. Converted to 1997 dollars using the Consumer Price Index.

• State population and percent of state population aged 15–24.
Source: U.S. Census Bureau.

• Per-capita beer consumption. Source: Beer Institute’s Brewer’s
Almanac. Measured as the per capita consumption of malt beverages (in gallons).

In [3]:
# Load the smoking dataset directly from the SyntheticControlMethods example
# datasets on GitHub (requires network access).
url = 'https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/smoking_data.csv'
df = pd.read_csv(url)
# Min-max scaling of the columns from the third one onward is currently
# disabled for this dataset; uncomment the four lines below to enable it.
# scaler = MinMaxScaler()
# scaler.fit(df.iloc[:, 2:])
# features = scaler.transform(df.iloc[:, 2:])
# df.iloc[:, 2:] = features

In [7]:
# Distinct state names present in the smoking data, in order of first appearance.
states = df.state.unique()
print(states)
# NOTE(review): the saved output of this cell also shows an AttributeError about
# `.pop` on a numpy array, which the two lines above cannot raise. It is
# presumably left over from an earlier version of the cell; re-run to refresh.

['Alabama' 'Arkansas' 'California' 'Colorado' 'Connecticut' 'Delaware'
 'Georgia' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Minnesota' 'Mississippi' 'Missouri' 'Montana'
 'Nebraska' 'Nevada' 'New Hampshire' 'New Mexico' 'North Carolina'
 'North Dakota' 'Ohio' 'Oklahoma' 'Pennsylvania' 'Rhode Island'
 'South Carolina' 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont'
 'Virginia' 'West Virginia' 'Wisconsin' 'Wyoming']


AttributeError: 'numpy.ndarray' object has no attribute 'pop'

In [3]:
# Panel dimensions used by the reshape in the next cell.
# dropna=False keeps the count identical to len(unique()), which counts NaN too.
n_state = df['state'].nunique(dropna=False)  # number of distinct state IDs
n_time = df['year'].nunique(dropna=False)  # number of distinct time steps
n_state, n_time

(39, 31)

In [4]:
# Build the [state, year, feature value] tensor and its missing-value mask.
# Sorting makes each state's rows contiguous and chronological, which is the
# layout the reshape below relies on.
df = df.sort_values(by=['state', 'year'])

# The reshape only makes sense for a balanced panel (one row per state/year).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# Drop state and year from the features. The cast to float avoids saving an
# object-dtype array (the state column forces object dtype on the full table),
# which np.load could only read back with allow_pickle=True.
arr_smoking = df_filled.to_numpy().reshape(n_state, n_time, n_cols)[:, :, 2:].astype(float)
mask_smoking = mask_df.reshape(n_state, n_time, n_cols)[:, :, 2:]

# np.save does not create missing directories.
os.makedirs('prop99', exist_ok=True)
np.save('prop99/data.npy', arr_smoking)
np.save('prop99/mask.npy', mask_smoking)


In [None]:
# Batched SVD: arr_smoking is a 3-D [state, year, feature] array, so NumPy
# factorises each state's (year x feature) matrix independently.
# astype(float) guarantees a numeric dtype for np.linalg.svd regardless of the
# dtype arr_smoking was built with.
# Note that the third value returned by np.linalg.svd is V^H (the conjugate
# transpose of V), even though it is named `v` here.
u, s, v = np.linalg.svd(arr_smoking.astype(float))
# Show the input shape followed by the shapes of the three factors.
arr_smoking.shape, '', u.shape, s.shape, v.shape

# German Reunification 
- GDP per Capita (PPP, 2002 USD).
- Investment Rate: Ratio of real domestic investment (private plus public) to real GDP. The data are reported in five-year averages.
- Schooling: Percentage of secondary school attained in the total population aged 25 and older. The data are reported in five-year increments.
- Industry: industry share of value added.
- Inflation: annual percentage change in consumer prices (base year 1995).
- Trade Openness: Export plus imports as percentage of GDP.

In [None]:
# Load the German reunification dataset from the SyntheticControlMethods
# example datasets on GitHub (requires network access).
url = 'https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/german_reunification.csv'
df = pd.read_csv(url)

# Min-max scale every column from the fourth one onward; the three leading
# columns are left untouched.
scaler = MinMaxScaler()
features = scaler.fit_transform(df.iloc[:, 3:])
df.iloc[:, 3:] = features

# Panel dimensions (dropna=False matches len(unique()), which counts NaN too).
n_state = df['country'].nunique(dropna=False)  # number of units
n_time = df['year'].nunique(dropna=False)  # number of time steps
n_state, n_time

In [None]:
# Build the [country, year, feature value] tensor and its missing-value mask.
# Sorting makes each country's rows contiguous and chronological, which is the
# layout the reshape below relies on.
df = df.sort_values(by=['country', 'year'])

# The reshape only makes sense for a balanced panel (one row per country/year).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# Drop code, country and year from the features. The cast to float avoids
# saving an object-dtype array, which np.load could only read back with
# allow_pickle=True.
arr_german = df_filled.to_numpy().reshape(n_state, n_time, n_cols)[:, :, 3:].astype(float)
mask_german = mask_df.reshape(n_state, n_time, n_cols)[:, :, 3:]

# np.save does not create missing directories.
os.makedirs('germany', exist_ok=True)
np.save('germany/data.npy', arr_german)
np.save('germany/mask.npy', mask_german)

# Basque 
- Data on terrorist activity (deaths and kidnappings) are provided by the Spanish Ministry of Interior (2002). 
- Regional data on GDP, investment, population density, and sectoral production come from Fundación BBV (1999). Data on human capital for different regions have been collected by Mas et al. (1998). 
- Oil prices come from the OECD statistical compendium CD-ROM. 
- Data on stock prices, firm size (market value of outstanding shares), book equity, and dividends are routinely collected by the Madrid Stock Exchange (www.bolsamadrid.es). 
- Interest rates on one-day public debt repurchase agreements and bonds come from the Bank of Spain.

In [3]:
# Load the Basque dataset from the SyntheticControlMethods example datasets on
# GitHub (requires network access).
url = 'https://raw.githubusercontent.com/OscarEngelbrektson/SyntheticControlMethods/master/examples/datasets/basque_data.csv'
df = pd.read_csv(url)

# The scaler is created but not applied: this dataset is exported unscaled.
# To enable scaling, uncomment the two lines below.
scaler = MinMaxScaler()
# features = scaler.fit_transform(df.iloc[:, 4:])
# df.iloc[:, 4:] = features

# Drop the rows labelled 'Spain (Espana)' (presumably the country-wide
# aggregate rather than a region; confirm against the data description).
df = df.loc[df['regionname'].ne('Spain (Espana)')]

# Panel dimensions (dropna=False matches len(unique()), which counts NaN too).
n_state = df['regionname'].nunique(dropna=False)  # number of units
n_time = df['year'].nunique(dropna=False)  # number of time steps
n_state, n_time

(17, 43)

In [5]:
# Build the [region, year, feature value] tensor and its missing-value mask.
# Sorting makes each region's rows contiguous and chronological, which is the
# layout the reshape below relies on.
df = df.sort_values(by=['regionname', 'year'])

# The reshape only makes sense for a balanced panel (one row per region/year).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# Drop the four leading columns so that only features remain.
# NOTE(review): the earlier comment named three id columns (code, country,
# year) while the slice drops four; confirm what the fourth dropped column is.
# The cast to float avoids saving an object-dtype array, which np.load could
# only read back with allow_pickle=True.
arr_basque = df_filled.to_numpy().reshape(n_state, n_time, n_cols)[:, :, 4:].astype(float)
mask_basque = mask_df.reshape(n_state, n_time, n_cols)[:, :, 4:]

# np.save does not create missing directories.
os.makedirs('basque_unscaled', exist_ok=True)
np.save('basque_unscaled/data.npy', arr_basque)
np.save('basque_unscaled/mask.npy', mask_basque)

# Retail
From https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting

In [None]:
# The triple-quoted string below is disabled code kept for reference: it is a
# bare string expression, so Python evaluates and discards it without running
# anything inside. It merged retail/features.csv with retail/stores.csv and
# min-max scaled the continuous columns.
''' 
### extra features that are not included in mRSC paper ###

features = pd.read_csv('retail/features.csv')
stores = pd.read_csv('retail/stores.csv')
#features.shape, stores.shape, len(np.unique(features.Store))
df = features.merge(stores, how='inner', on = "Store")

scaler = MinMaxScaler()
continuous_features = ['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size']
discrete_features = ['IsHoliday', 'Type']
state = ['Store']
time = ['Date']
df = df[state + time + continuous_features + discrete_features] #state, time, continuous features, discrete features

scaler.fit(df.loc[:, df.columns.isin(continuous_features)])
features = scaler.transform(df.loc[:, df.columns.isin(continuous_features)])
df.loc[:, df.columns.isin(continuous_features)] = features

n_state = len(df.Store.unique()) #number of units
n_time = len(df.Date.unique()) #number of time steps
n_state, n_time
'''

# Only the training file is used; retail/test.csv is skipped because its
# weekly sales are NaN.
train = pd.read_csv('retail/train.csv')
df = train

continuous_features = ['Weekly_Sales']
discrete_features = ['Dept']
state = ['Store']
time = ['Date']

# Column order: state, time, continuous features, discrete features.
df = df[state + time + continuous_features + discrete_features]

# Pivot to one row per (Store, Date) with one Weekly_Sales column per Dept.
# 'Dept' is the second-to-last index level, i.e. level=-2.
df = (
    df.set_index(['Store', 'Dept', 'Date'])
    .unstack(level='Dept')
    .reset_index(level=['Store', 'Date'])
)

# Min-max scale every per-department sales column (all columns after Store
# and Date).
scaler = MinMaxScaler()
features = scaler.fit_transform(df.iloc[:, 2:])
df.iloc[:, 2:] = features

n_state = len(pd.unique(df['Store']))  # number of units
n_time = len(pd.unique(df['Date']))  # number of time steps
n_metrics = df.shape[1] - 2  # feature columns, i.e. all but Store and Date
n_state, n_time, n_metrics

In [None]:
# Build the [store, date, feature value] tensor and its missing-value mask.
# Sorting makes each store's rows contiguous and ordered by date, which is the
# layout the reshape below relies on.
df = df.sort_values(by=['Store', 'Date'])

# The reshape only makes sense for a balanced panel (one row per store/date).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# Drop store and date from the features. The cast to float avoids saving an
# object-dtype array, which np.load could only read back with
# allow_pickle=True.
arr_retail = df_filled.to_numpy().reshape(n_state, n_time, n_cols)[:, :, 2:].astype(float)
mask_retail = mask_df.reshape(n_state, n_time, n_cols)[:, :, 2:]

# np.save does not create missing directories.
os.makedirs('retail', exist_ok=True)
np.save('retail/data.npy', arr_retail)
np.save('retail/mask.npy', mask_retail)

# Diabetes
From https://archive.ics.uci.edu/ml/datasets/diabetes

In [136]:
# Load the per-patient files of the diabetes dataset, pivot them to one row per
# (patient, day) with one column per measurement code, and pad every patient to
# the full range of days so the panel is balanced.
path = os.getcwd()
# os.path.join keeps the pattern portable (the hand-written "\diabetes\data*"
# string only worked on Windows and relied on invalid escape sequences).
# Sorting makes the patient ids assigned below deterministic.
files = sorted(glob.glob(os.path.join(path, 'diabetes', 'data*')))

li = []

patient_id = 1
for f in files:
    # On case-insensitive file systems the pattern also matches the code book
    # "Data-Codes". Skip it with `continue`: a `break` here would drop every
    # file that happens to come after it in the listing.
    if os.path.basename(f).lower() == 'data-codes':
        continue
    data = pd.read_csv(f, delimiter='\t', header=None, names=["date", "time", "code", "value"], engine='python')

    # cleaning
    data.loc[data['date'] == '06-31-1991', 'date'] = '06-30-1991'  # incorrect date
    data.loc[data['time'].isin(['56:35', '188:00']), 'time'] = np.nan  # incorrect times

    # Values that cannot be parsed as numbers become NaN.
    data['value'] = pd.to_numeric(data['value'], errors='coerce')

    data['id'] = patient_id
    li.append(data)
    patient_id += 1

if not li:
    raise FileNotFoundError(
        'no patient files matching ' + os.path.join(path, 'diabetes', 'data*'))

df = pd.concat(li)
df['date'] = pd.to_datetime(df['date']).dt.date
df = df.drop(['time'], axis=1)

# pivot dataframe: one row per (id, date), one column per code; repeated
# measurements of the same code on the same day are averaged (pivot_table's
# default aggregation).
df['value'] = df['value'].astype(float)
df = pd.pivot_table(df, index=['id', 'date'], columns='code', values='value').reset_index()

# Every calendar day between the first and last observation, inclusive.
date_range = pd.date_range(df.date.min(), df.date.max(), freq='D')
# df['date'] holds datetime.date objects, so compare against dates rather than
# Timestamps (Timestamp == date comparisons are not reliable across pandas
# versions).
all_dates = [ts.date() for ts in date_range]

li_final = []

for idx in df.id.unique():
    df_idx = df[df['id'] == idx]
    seen_dates = set(df_idx['date'])
    # One placeholder row (all measurements NaN) per day without observations.
    dic_list = [{'id': idx, 'date': day} for day in all_dates if day not in seen_dates]

    if dic_list:
        rows = pd.DataFrame(dic_list)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        # sort=False keeps the id, date, <codes...> column order.
        df_idx_ = pd.concat([df_idx, rows], ignore_index=True, sort=False)
    else:
        df_idx_ = df_idx.reset_index(drop=True)
    li_final.append(df_idx_)


df = pd.concat(li_final)
df.to_csv('diabetes/allData.csv')
df.head(10)

code,id,date,4,33,34,35,36,48,56,57,...,63,64,65,66,67,68,69,70,71,72
0,1,1991-04-21,,8.000000,13.0,,,123.0,,,...,,,,,,,,,,
1,1,1991-04-22,,6.333333,13.0,,,,,,...,,,,,,,,,,
2,1,1991-04-23,,9.000000,13.0,,,,,,...,,,,,,,,,,
3,1,1991-04-24,,6.333333,14.0,,,340.0,,,...,,,,,,,,,,
4,1,1991-04-25,,5.500000,14.0,,,288.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3876,70,1989-05-08,,1.000000,6.5,,,145.0,,,...,,,,,,,,,,
3877,70,1989-05-09,,1.000000,7.0,,,,,,...,,,,,,,,,,
3878,70,1989-05-10,,,7.0,,,,,,...,,,,,,,,,,
3879,70,1989-05-11,,,7.0,,,,,,...,,,,,,,,,,


In [122]:
scaler = MinMaxScaler()

# Every column after the two leading ones is treated as a continuous feature;
# this dataset has no discrete features.
continuous_features = df.columns[2:]
state = ['id']
time = ['date']

# Min-max scale the continuous columns in place.
is_continuous = df.columns.isin(continuous_features)
features = scaler.fit_transform(df.loc[:, is_continuous])
df.loc[:, is_continuous] = features

# Panel dimensions (dropna=False matches len(unique()), which counts NaN too).
n_state = df['id'].nunique(dropna=False)  # number of units
n_time = df['date'].nunique(dropna=False)  # number of time steps
n_state, n_time

ValueError: could not convert string to float: 'm'

In [None]:
# Build the [patient, date, column value] tensor and its missing-value mask.
# Sorting makes each patient's rows contiguous and chronological, which is the
# layout the reshape below relies on.
df = df.sort_values(by=['id', 'date'])

# The reshape only makes sense for a balanced panel (one row per patient/date).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# NOTE(review): unlike the other datasets in this notebook, no leading columns
# are sliced off here, so the saved arrays still contain the id and date
# columns. Confirm whether a `[:, :, 2:]` slice is intended.
arr_diab = df_filled.to_numpy().reshape(n_state, n_time, n_cols)
mask_diab = mask_df.reshape(n_state, n_time, n_cols)

# np.save does not create missing directories.
os.makedirs('diabetes', exist_ok=True)
np.save('diabetes/data.npy', arr_diab)
np.save('diabetes/mask.npy', mask_diab)

# Asthma: CAMP Training 
The Childhood Asthma Management Program (CAMP) was a clinical trial carried out in children with asthma. The trial was designed to determine the long-term effects of 3 treatments (budesonide, nedocromil, or placebo) on pulmonary function as measured by normalized FEV1 over a 5-6.5 year period. The design of CAMP was a multicenter, masked, placebo-controlled, randomized trial. A total of 1,041 children (311 in the budesonide group, 312 in the nedocromil group and 418 in the placebo group) aged 5-12 years were enrolled between December of 1993 and September of 1995. The primary outcome of the trial was lung function as measured by the Forced Expiratory Volume at 1 second (FEV1). Secondary outcomes included: bronchial responsiveness to methacholine, need for beclomethasone due to asthma symptoms, termination of assigned treatment due to cessation of symptoms, and asthma morbidity (frequency and severity of asthma symptoms, frequency and magnitude of PEFR measurements less than 80% of personal best, prn use of supplemental inhaled albuterol, nocturnal awakenings, days of limited activity, absences from school, and courses of steroids). The study also followed participants for outcomes related to mortality, long-term safety, side effects, physical growth and development, psychological growth and development, individual and family functioning, and use of health care resources.

In [3]:
df = pd.read_csv('asthma/camp_teach.csv')
# keep control (C) and ID 79 bud (A); patient 79 comes first
df = pd.concat([df[df['id'] == 79], df[df['TG'] == 'C']])

# Pad every patient to the full set of visit codes so the panel is balanced:
# a visit the patient has no row for gets a placeholder row with only id and
# visitc set (every other column NaN).
all_visits = df.visitc.unique()  # hoisted: identical for every patient
li_final = []
for idx in df.id.unique():
    df_idx = df[df['id'] == idx]
    seen_visits = set(df_idx.visitc.unique())
    dic_list = [{'id': idx, 'visitc': date} for date in all_visits if date not in seen_visits]

    if dic_list:
        rows = pd.DataFrame(dic_list)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_idx_ = pd.concat([df_idx, rows], ignore_index=True, sort=True)
    else:
        df_idx_ = df_idx.reset_index(drop=True)
    li_final.append(df_idx_)

df = pd.concat(li_final)

continuous_features = ['PREFF','age_rz', 'hemog', 'PREFEV', 'PREFVC',  'PREPF', 'POSFEV', 'POSFVC',
                      'POSFF', 'POSPF', 'PREFEVPP', 'PREFVCPP', 'POSFEVPP', 'POSFVCPP', 'wbc', 'agehome']
# Defined for reference only; the discrete features are not exported here.
discrete_features = ['GENDER', 'ETHNIC', 'anypet', 'woodstove', 'dehumid', 'parent_smokes', 'any_smokes']
state = ['id']
time = ['visitc'] #choose months
# Column order: state, time, continuous features. The explicit copy avoids
# assigning into a view of the padded frame below.
df = df[state + time + continuous_features].copy()


# Min-max scale the continuous columns in place.
scaler = MinMaxScaler()
is_continuous = df.columns.isin(continuous_features)
features = scaler.fit_transform(df.loc[:, is_continuous])
df.loc[:, is_continuous] = features

n_state = len(df.id.unique()) #number of units
n_time = len(df.visitc.unique()) #number of time steps
n_state, n_time

(276, 20)

In [3]:
# Preview the first five rows of the current df.
df.head()

Unnamed: 0,id,visitc,PREFF,age_rz,hemog,PREFEV,PREFVC,PREPF,POSFEV,POSFVC,POSFF,POSPF,PREFEVPP,PREFVCPP,POSFEVPP,POSFVCPP,wbc,agehome
0,79,0,0.904762,0.625,0.777228,0.132773,0.112882,0.125,0.117742,0.134289,0.672131,0.090909,0.504202,0.327731,0.333333,0.299065,0.242574,0.416667
1,79,2,0.825397,0.625,,0.127731,0.119522,0.125,0.156452,0.135593,0.901639,0.077922,0.470588,0.344538,0.453704,0.28972,,
2,79,4,0.714286,0.625,,0.122689,0.132802,0.1125,0.122581,0.138201,0.672131,0.103896,0.453782,0.378151,0.324074,0.28972,,
3,79,12,0.809524,0.625,,0.132773,0.126162,0.1,0.153226,0.136897,0.868852,0.103896,0.420168,0.285714,0.361111,0.205607,,0.625
4,79,16,0.84127,0.625,,0.196639,0.179283,0.15,0.21129,0.185137,0.885246,0.123377,0.554622,0.403361,0.490741,0.327103,,


In [6]:
# Build the [patient, visit, feature value] tensor and its missing-value mask.
# Sorting makes each patient's rows contiguous and ordered by visit, which is
# the layout the reshape below relies on.
df = df.sort_values(by=['id', 'visitc'])

# The reshape only makes sense for a balanced panel (one row per patient/visit).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# Drop id and visitc from the features. The cast to float guarantees a plain
# numeric array is saved, one that np.load can read without allow_pickle=True.
arr_bud = df_filled.to_numpy().reshape(n_state, n_time, n_cols)[:, :, 2:].astype(float)
mask_bud = mask_df.reshape(n_state, n_time, n_cols)[:, :, 2:]

# np.save does not create missing directories.
os.makedirs('asthma_bud', exist_ok=True)
np.save('asthma_bud/data.npy', arr_bud)
np.save('asthma_bud/mask.npy', mask_bud)

In [4]:
df = pd.read_csv('asthma/camp_teach.csv')
# keep the control group (C) only; no treated patient is added in this variant
df =  df[df['TG'] == 'C']

# Pad every patient to the full set of visit codes so the panel is balanced:
# a visit the patient has no row for gets a placeholder row with only id and
# visitc set (every other column NaN).
all_visits = df.visitc.unique()  # hoisted: identical for every patient
li_final = []
for idx in df.id.unique():
    df_idx = df[df['id'] == idx]
    seen_visits = set(df_idx.visitc.unique())
    dic_list = [{'id': idx, 'visitc': date} for date in all_visits if date not in seen_visits]

    if dic_list:
        rows = pd.DataFrame(dic_list)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_idx_ = pd.concat([df_idx, rows], ignore_index=True, sort=True)
    else:
        df_idx_ = df_idx.reset_index(drop=True)
    li_final.append(df_idx_)

df = pd.concat(li_final)

continuous_features = ['PREFF','age_rz', 'hemog', 'PREFEV', 'PREFVC',  'PREPF', 'POSFEV', 'POSFVC',
                      'POSFF', 'POSPF', 'PREFEVPP', 'PREFVCPP', 'POSFEVPP', 'POSFVCPP', 'wbc', 'agehome']
# Defined for reference only; the discrete features are not exported here.
discrete_features = ['GENDER', 'ETHNIC', 'anypet', 'woodstove', 'dehumid', 'parent_smokes', 'any_smokes']
state = ['id']
time = ['visitc'] #choose months
# Column order: state, time, continuous features. The explicit copy avoids
# assigning into a view of the padded frame below.
df = df[state + time + continuous_features].copy()


# Min-max scale the continuous columns in place.
scaler = MinMaxScaler()
is_continuous = df.columns.isin(continuous_features)
features = scaler.fit_transform(df.loc[:, is_continuous])
df.loc[:, is_continuous] = features

n_state = len(df.id.unique()) #number of units
n_time = len(df.visitc.unique()) #number of time steps
n_state, n_time

(275, 20)

In [5]:
# Preview the first five rows of the current df.
df.head()

Unnamed: 0,id,visitc,PREFF,age_rz,hemog,PREFEV,PREFVC,PREPF,POSFEV,POSFVC,POSFF,POSPF,PREFEVPP,PREFVCPP,POSFEVPP,POSFVCPP,wbc,agehome
0,10,0,0.634921,0.375,0.808168,0.30084,0.329349,0.3125,0.33871,0.342894,0.721311,0.298701,0.588235,0.554622,0.592593,0.523364,0.336634,0.208333
1,10,2,0.571429,0.375,,0.317647,0.36919,0.3375,0.348387,0.375489,0.639344,0.337662,0.596639,0.613445,0.583333,0.570093,,
2,10,4,0.619048,0.375,,0.357983,0.394422,0.2875,0.38871,0.395046,0.704918,0.350649,0.638655,0.621849,0.62963,0.570093,,
3,10,12,0.619048,0.375,,0.411765,0.447543,0.4375,0.446774,0.452412,0.704918,0.396104,0.621849,0.605042,0.611111,0.560748,,0.333333
4,10,16,0.68254,0.375,,0.465546,0.478088,0.4625,0.483871,0.481095,0.721311,0.454545,0.680672,0.630252,0.648148,0.579439,,


In [5]:
# Build the [patient, visit, feature value] tensor and its missing-value mask.
# Sorting makes each patient's rows contiguous and ordered by visit, which is
# the layout the reshape below relies on.
df = df.sort_values(by=['id', 'visitc'])

# The reshape only makes sense for a balanced panel (one row per patient/visit).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# Drop id and visitc from the features. The cast to float guarantees a plain
# numeric array is saved, one that np.load can read without allow_pickle=True.
# NOTE: the arr_bud / mask_bud names are kept from the original cell, but the
# arrays are written to the asthma_placebo directory below.
arr_bud = df_filled.to_numpy().reshape(n_state, n_time, n_cols)[:, :, 2:].astype(float)
mask_bud = mask_df.reshape(n_state, n_time, n_cols)[:, :, 2:]
print(arr_bud.shape,mask_bud.shape)

# np.save does not create missing directories.
os.makedirs('asthma_placebo', exist_ok=True)
np.save('asthma_placebo/data.npy', arr_bud)
np.save('asthma_placebo/mask.npy', mask_bud)

(275, 20, 16) (275, 20, 16)


In [268]:
df = pd.read_csv('asthma/camp_teach.csv')
# keep control (C) plus one ned (B) patient; that patient comes first
# NOTE(review): the original comment named ID 82 but the code selects ID 92.
# Confirm which patient is intended.
df = pd.concat([df[df['id'] == 92], df[df['TG'] == 'C']])

# Pad every patient to the full set of visit codes so the panel is balanced:
# a visit the patient has no row for gets a placeholder row with only id and
# visitc set (every other column NaN).
all_visits = df.visitc.unique()  # hoisted: identical for every patient
li_final = []
for idx in df.id.unique():
    df_idx = df[df['id'] == idx]
    seen_visits = set(df_idx.visitc.unique())
    dic_list = [{'id': idx, 'visitc': date} for date in all_visits if date not in seen_visits]

    if dic_list:
        rows = pd.DataFrame(dic_list)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_idx_ = pd.concat([df_idx, rows], ignore_index=True, sort=True)
    else:
        df_idx_ = df_idx.reset_index(drop=True)
    li_final.append(df_idx_)

df = pd.concat(li_final)

continuous_features = ['age_rz', 'hemog', 'PREFEV', 'PREFVC', 'PREFF', 'PREPF', 'POSFEV', 'POSFVC',
                      'POSFF', 'POSPF', 'PREFEVPP', 'PREFVCPP', 'POSFEVPP', 'POSFVCPP', 'wbc', 'agehome']
discrete_features = ['GENDER', 'ETHNIC', 'anypet', 'woodstove', 'dehumid', 'parent_smokes', 'any_smokes']
state = ['id']
time = ['visitc'] #choose months
# Column order: state, time, continuous features, discrete features. The
# explicit copy avoids assigning into a view of the padded frame below.
df = df[state + time + continuous_features + discrete_features].copy()


# Min-max scale only the continuous columns; discrete columns stay as loaded.
scaler = MinMaxScaler()
is_continuous = df.columns.isin(continuous_features)
features = scaler.fit_transform(df.loc[:, is_continuous])
df.loc[:, is_continuous] = features

n_state = len(df.id.unique()) #number of units
n_time = len(df.visitc.unique()) #number of time steps
n_state, n_time

(276, 20)

In [269]:
# Build the [patient, visit, feature value] tensor and its missing-value mask.
# Sorting makes each patient's rows contiguous and ordered by visit, which is
# the layout the reshape below relies on.
df = df.sort_values(by=['id', 'visitc'])

# The reshape only makes sense for a balanced panel (one row per patient/visit).
if len(df) != n_state * n_time:
    raise ValueError(
        f'unbalanced panel: expected {n_state * n_time} rows '
        f'(n_state * n_time) but found {len(df)}')

# Record missing entries before filling them. The filled copy gets its own name
# so that re-running this cell never recomputes the mask from already-filled
# data (which would silently save an all-False mask).
mask_df = df.isna().to_numpy()
df_filled = df.fillna(0)

n_cols = df.shape[1]
# Drop id and visitc from the features. No float cast here: the discrete
# feature columns are exported as loaded and may not be numeric (presumably
# string codes; TODO confirm), in which case the saved array has object dtype
# and np.load needs allow_pickle=True.
arr_ned = df_filled.to_numpy().reshape(n_state, n_time, n_cols)[:, :, 2:]
mask_ned = mask_df.reshape(n_state, n_time, n_cols)[:, :, 2:]

# np.save does not create missing directories.
os.makedirs('asthma_ned', exist_ok=True)
np.save('asthma_ned/data.npy', arr_ned)
np.save('asthma_ned/mask.npy', mask_ned)

In [None]:
# FA data
# NOTE(review): `url` is not assigned in this cell, so this re-reads whatever
# URL an earlier cell last set (the Basque dataset URL when the notebook is run
# top to bottom). TODO: assign the FA dataset location here before reading.

df = pd.read_csv(url)