In [192]:
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures


from sklearn.metrics import r2_score

In [214]:
import seaborn as sns
import matplotlib.pyplot as plt

In [294]:
df=pd.read_pickle('/home/kchiv/kchiv/metis/metisgh/sf19_ds15/combined_article_metrics.pkl')

df.loc[df['num_times_cited'] == 0, 'num_times_cited'] = 1
df.loc[df['num_authors'] == 0, 'num_authors'] = 3
df.loc[df['num_institutions'] == 0, 'num_institutions'] = 1
df.loc[df['altmetric'] == 0, 'altmetric'] = 1
df.loc[df['mean_i10_index'] == 0, 'mean_i10_index'] = 1


y = np.log(df['altmetric'])
df['num_times_cited_log'] = np.log(df['num_times_cited'])
df['num_authors_log'] = np.log(df['num_authors'])
df['mean_h_index_log'] = np.log(df['mean_h_index'])
df['mean_i10_index_log'] = np.log(df['mean_i10_index'])


drop_cols = ['altmetric','fig_count', 'top50','top100', 'num_times_cited', 
             'other', 'num_institutions', 'num_authors', 'mean_h_index',
           'abstract_length', 'mean_i10_index','page_length', 'title_length', 'ref_cnt']

df = df.drop(columns = drop_cols)
X =df


X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=71) #hold out 20% of the data for final testing

#this helps with the way kf will generate indices below
X, y = np.array(X), np.array(y)

In [295]:
kf = KFold(n_splits=5, shuffle=True, random_state = 71)
cv_lm_r2s, cv_lm_reg_r2s, cv_lasso_r2s, cv_elastic_r2s, cv_poly_r2s = [], [], [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    #simple linear regression
    lm = LinearRegression()

    lm.fit(X_train, y_train)
    cv_lm_r2s.append(lm.score(X_val, y_val))
    
    #ridge with feature scaling
    lm_reg = Ridge(alpha=10)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    lm_reg.fit(X_train_scaled, y_train)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    #lasso
    lasso = Lasso(alpha = 0.01)

    lasso.fit(X_train_scaled, y_train)
    cv_lasso_r2s.append(lasso.score(X_val_scaled, y_val))
    
    #Elastic Net CV
    elastic = ElasticNet(alpha = 10, l1_ratio = 0.8)

    elastic.fit(X_train_scaled, y_train)
    cv_elastic_r2s.append(elastic.score(X_val_scaled, y_val))
    
    
    #polynomial regression
    lm_poly = LinearRegression()
    poly = PolynomialFeatures(degree=2)
    
    X_poly = poly.fit_transform(X_train)
    X_val = poly.transform(X_val)

    lm_poly.fit(X_poly, y_train)
#     y_pred = lm_poly.predict(X_poly)
    
#     cv_poly_r2s.append(r2_score(y_train, y_pred))
    cv_poly_r2s.append(lm_poly.score(X_val, y_val))
    


print('Simple regression scores: \n', cv_lm_r2s, '\n')
print('Ridge scores: \n', cv_lm_reg_r2s, '\n')
print('Lasso scores: \n', cv_lasso_r2s, '\n')
print('Elastic scores: \n', cv_lasso_r2s, '\n')
print('Poly scores: \n', cv_poly_r2s, '\n')


print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}')
print(f'Lasso mean cv r^2: {np.mean(cv_lasso_r2s):.3f} +- {np.std(cv_lasso_r2s):.3f}')
print(f'Elastic mean cv r^2: {np.mean(cv_lasso_r2s):.3f} +- {np.std(cv_lasso_r2s):.3f}')
print(f'Poly mean cv r^2: {np.mean(cv_poly_r2s):.3f} +- {np.std(cv_poly_r2s):.3f}')


Simple regression scores: 
 [0.5382291995178371, 0.5761023902365623, 0.43630232349250697, 0.570647743537464, 0.5531210103486601] 

Ridge scores: 
 [0.5373331702543613, 0.5737514386093089, 0.43836710408002655, 0.5704865131623713, 0.5545780502799703] 

Lasso scores: 
 [0.5417129718398761, 0.5730429586053811, 0.4362869659267191, 0.5710949742864683, 0.5545239390302655] 

Elastic scores: 
 [0.5417129718398761, 0.5730429586053811, 0.4362869659267191, 0.5710949742864683, 0.5545239390302655] 

Poly scores: 
 [0.43100333333479074, 0.09824073015194258, 0.46473224227209786, 0.5606097217246047, 0.5342620486021648] 

Simple mean cv r^2: 0.535 +- 0.051
Ridge mean cv r^2: 0.535 +- 0.050
Lasso mean cv r^2: 0.535 +- 0.051
Elastic mean cv r^2: 0.535 +- 0.051
Poly mean cv r^2: 0.418 +- 0.166


In [296]:
list(zip(df.columns, lm_reg.coef_))

[('mean_author_citations', -0.08356828968527669),
 ('top20', 0.10435534234536795),
 ('year', -1.2979305086075599),
 ('num_times_cited_log', 0.6657202617547507),
 ('num_authors_log', 0.1821045899317982),
 ('mean_h_index_log', 0.09004325562517684),
 ('mean_i10_index_log', -0.07635046707442826)]

In [297]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=71) #hold out 20% of the data for final testing

lm_reg = Ridge(alpha=100)
    
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lm_reg.fit(X_scaled, y_train)
lm_reg.score(X_test_scaled, y_test)

0.5203541123090603

In [298]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=71) #hold out 20% of the data for final testing

lm_poly = LinearRegression()
poly = PolynomialFeatures(degree=2)
    
X_poly = poly.fit_transform(X_train)
X_test_scaled = poly.transform(X_test)

lm_poly.fit(X_poly, y_train)
lm_poly.score(X_test_scaled, y_test)

0.431003332534747

In [236]:
list(zip(df.columns, lm_poly.coef_))

[('num_times_cited', -8.823107637090529e-09),
 ('mean_author_citations', -0.9527752720440794),
 ('mean_h_index', -0.00029623380030888564),
 ('mean_i10_index', 2.6079724062394383),
 ('num_authors', 0.002467310054922143),
 ('top20', 1.2627747026995295),
 ('year', -0.09979510871796438)]

In [237]:
elastic.coef_

array([ 0.,  0.,  0.,  0.,  0.,  0., -0.])

Run with non-generated values of only 80

In [211]:
#run with non-generated values of only 80
df=pd.read_pickle('/home/kchiv/kchiv/metis/metisgh/sf19_ds15/real_article_metrics.pkl')

df.loc[df['num_times_cited'] == 0, 'num_times_cited'] = 1
df.loc[df['num_authors'] == 0, 'num_authors'] = 3
df.loc[df['num_institutions'] == 0, 'num_institutions'] = 1
df.loc[df['altmetric'] == 0, 'altmetric'] = 1
df.loc[df['mean_i10_index'] == 0, 'mean_i10_index'] = 1


y = np.log(df['altmetric'])
df['num_times_cited'] = np.log(df['num_times_cited'])
df['num_authors'] = np.log(df['num_authors'])
df['mean_h_index'] = np.log(df['mean_h_index'])

drop_cols = ['altmetric','fig_count', 'top50','top100',
             'other', 'num_institutions', 'mean_author_citations',
           'abstract_length', 'page_length', 'title_length', 'mean_i10_index', 'ref_cnt']


df = df.drop(columns = drop_cols)
X =df


X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=71) #hold out 20% of the data for final testing

#this helps with the way kf will generate indices below
X, y = np.array(X), np.array(y)

In [212]:
f = KFold(n_splits=5, shuffle=True, random_state = 71)
cv_lm_r2s, cv_lm_reg_r2s, cv_poly_r2s = [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    #simple linear regression
    lm = LinearRegression()

    lm.fit(X_train, y_train)
    cv_lm_r2s.append(lm.score(X_val, y_val))
    
    #ridge with feature scaling
    lm_reg = Ridge(alpha=100)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    lm_reg.fit(X_train_scaled, y_train)
    cv_lm_reg_r2s.append(lm_reg.score(X_val_scaled, y_val))
    
    
    #polynomial regression
    lm_poly = LinearRegression()
    poly = PolynomialFeatures(degree=2)
    
    X_poly = poly.fit_transform(X_train)
    X_val = poly.transform(X_val)

    lm_poly.fit(X_poly, y_train)
#     y_pred = lm_poly.predict(X_poly)
    
#     cv_poly_r2s.append(r2_score(y_train, y_pred))
    cv_poly_r2s.append(lm_poly.score(X_val, y_val))


print('Simple regression scores: \n', cv_lm_r2s, '\n')
print('Ridge scores: \n', cv_lm_reg_r2s, '\n')
print('Poly scores: \n', cv_poly_r2s, '\n')

print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
print(f'Ridge mean cv r^2: {np.mean(cv_lm_reg_r2s):.3f} +- {np.std(cv_lm_reg_r2s):.3f}')
print(f'Poly mean cv r^2: {np.mean(cv_poly_r2s):.3f} +- {np.std(cv_poly_r2s):.3f}')

#the non-generated values perform terribly

Simple regression scores: 
 [0.5369828241547616, 0.7824714777838894, 0.687874097145421, 0.5249063710583584, -1.3968190570662569, 0.5956617315852406, 0.6703236521698307, 0.8620037642849794, 0.5706170397159102, 0.31780999464352067] 

Ridge scores: 
 [0.22639733542114837, 0.48992127503558613, 0.5753365001440827, 0.43725952092094733, -1.3565261031713542, 0.3456363127430241, 0.3545155689498865, 0.502638583101522, 0.23568511136024653, 0.3486507827840071] 

Poly scores: 
 [-0.2500903328907249, 0.15471686378608984, 0.2514500940688431, -0.4081691807119967, -1.0622028077341468, 0.314345891675308, 0.268083916704126, 0.7993817615311783, 0.46588925218236926, 0.39309978711449256] 

Simple mean cv r^2: 0.415 +- 0.621
Ridge mean cv r^2: 0.216 +- 0.535
Poly mean cv r^2: 0.093 +- 0.504
