In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.csr import csr_matrix
plt.style.use('ggplot')

# Error function
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both inputs are converted to flat float arrays before subtracting, so
    values are compared position by position. This avoids two silent
    failure modes of a plain ``y_pred - y_true``:

    * a column vector of shape ``(n, 1)`` against a vector of shape ``(n,)``
      broadcasts to an ``(n, n)`` matrix;
    * two pandas Series with different indexes are aligned by label and
      yield NaN.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    true_values = np.asarray(y_true, dtype=float).ravel()
    predicted_values = np.asarray(y_pred, dtype=float).ravel()
    return np.sqrt(np.mean((predicted_values - true_values) ** 2))

# Read the training set and discard the 'url' column in a single expression.
df = pd.read_csv('../input/2018s1-mo444-assignment-01//train.csv').drop('url', axis=1)

In [None]:
# Distribution of the target column as loaded.
plt.figure(figsize=[5, 5])
shares_axis = sns.distplot(df['shares'])
# Scientific notation on the y axis; tilted x tick labels.
shares_axis.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
plt.xticks(rotation=30)
plt.show()

In [None]:
# Replace the target with its natural logarithm, then plot the new distribution.
# NOTE: the column is overwritten, so running this cell twice applies the log twice.
df['shares'] = np.log(df['shares'])
plt.figure(figsize=[5, 5])
sns.distplot(df['shares'])
plt.show()

In [None]:
# Scatter the (log-transformed) target against two features, one panel each.
# The target is on the x axis and the feature on the y axis, so each axis is
# labelled with what it actually shows.
plt.figure(figsize=[15, 15])

for panel, feature in enumerate(['n_non_stop_unique_tokens', 'n_non_stop_words'], start=1):
    plt.subplot(3, 3, panel)
    plt.scatter(df.shares, df[feature])
    plt.xlabel('log(shares)')
    plt.ylabel(feature)

plt.show()

In [None]:
# One violin plot of the target per data-channel indicator, on a 3x3 grid.
channel_flags = [
    'data_channel_is_lifestyle',
    'data_channel_is_entertainment',
    'data_channel_is_bus',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'data_channel_is_world',
]

plt.figure(figsize=[15, 13])
for slot, flag in enumerate(channel_flags, start=1):
    plt.subplot(3, 3, slot)
    sns.violinplot(x=flag, y='shares', data=df)

plt.show()

In [None]:
# One violin plot of the target per weekday indicator (plus is_weekend).
# All eight panels share one 3x3 grid so none of them overlaps another;
# Monday previously used a different grid spec and collided with the first
# 3x3 slot.
weekday_flags = [
    'weekday_is_monday',
    'weekday_is_tuesday',
    'weekday_is_wednesday',
    'weekday_is_thursday',
    'weekday_is_friday',
    'weekday_is_saturday',
    'weekday_is_sunday',
    'is_weekend',
]

plt.figure(figsize=[15, 13])
for slot, flag in enumerate(weekday_flags, start=1):
    plt.subplot(3, 3, slot)
    sns.violinplot(x=flag, y='shares', data=df)

plt.show()

In [None]:
# Columns left out of the boxplot below.
# NOTE(review): presumably excluded because their scale would dwarf the other
# columns - confirm.
check_outliers = ['kw_max_avg', 'self_reference_max_shares', 'self_reference_avg_sharess', 
                  'kw_avg_max', 'kw_avg_avg', 'kw_min_max', 'n_tokens_content', 'kw_avg_min',
                  'self_reference_min_shares', 'kw_max_min', 'kw_max_max', 'shares']

# Keep the frame's own column order. A set difference would order the boxes
# arbitrarily and differently from run to run.
boxplot_columns = [col for col in df.columns if col not in check_outliers]

plt.figure(figsize=[20,10])
df[boxplot_columns].boxplot()
plt.xticks(rotation=90)
plt.show()

In [None]:
#df[discrete_vars] = df[discrete_vars].apply(lambda x: x.astype('category'))
#df.info()

In [None]:
# Remove the 'is_weekend' column from the training frame.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
df = df.drop(['is_weekend'], axis=1, errors='ignore')

In [None]:
# Keep only the rows where all three token features are below their cut-offs.
within_bounds = (
    (df.n_non_stop_unique_tokens < 600)
    & (df.n_unique_tokens < 500)
    & (df.n_non_stop_words < 1000)
)
df = df[within_bounds]

In [None]:
# Same two scatter plots as before, drawn again after the outlier filter.
# The target is on the x axis and the feature on the y axis, so each axis is
# labelled with what it actually shows.
plt.figure(figsize=[15, 15])

for panel, feature in enumerate(['n_non_stop_unique_tokens', 'n_non_stop_words'], start=1):
    plt.subplot(3, 3, panel)
    plt.scatter(df.shares, df[feature])
    plt.xlabel('log(shares)')
    plt.ylabel(feature)

plt.show()

In [None]:
# One scatter plot per column: target on the x axis, the column on the y axis.
# Each axis is labelled with what it shows (the column name used to be put on
# the x axis, which holds the target).
for col in df.columns:
    plt.scatter(df.shares, df[col].values)
    plt.xlabel('log(shares)')
    plt.ylabel(col)
    plt.show()

In [None]:
# Candidate names for the "discrete" feature group, each listed once.
# A repeated name would make `frame[discrete_vars]` return the column twice.
_discrete_candidates = ['n_tokens_title', 'num_keywords', 'kw_min_min', 'kw_max_min',
                        'kw_min_max', 'kw_max_max', 'num_hrefs', 'num_self_hrefs',
                        'num_imgs', 'num_videos', 'n_tokens_content',
                        'n_non_stop_words', 'data_channel_is_lifestyle',
                        'data_channel_is_entertainment', 'data_channel_is_bus',
                        'data_channel_is_socmed', 'data_channel_is_tech', 'data_channel_is_world',
                        'weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday',
                        'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday',
                        'weekday_is_sunday', 'is_weekend']

# Keep only the names still present in df. 'is_weekend' is dropped from df in
# an earlier cell, and selecting a missing column raises KeyError.
discrete_vars = [col for col in _discrete_candidates if col in df.columns]

# Every remaining non-target column, in the frame's own (deterministic) order.
other_data = [col for col in df.columns if col not in discrete_vars and col != 'shares']

In [None]:
# ML Models
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures

# Metrics
from sklearn.metrics import mean_absolute_error

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

In [None]:
###################### XGBoost ######################
# Fit a gradient-boosted regressor on standardized features and score it on
# the separate test files. RMSE is computed on the log target; MAE is
# computed after mapping both sides back with exp.

scaler = StandardScaler()

X = df.drop(['shares'], axis=1)
# Remember the training columns so the test frame can be aligned to them.
feature_columns = list(X.columns)
X = scaler.fit_transform(X)

y = df.shares


reg = XGBRegressor(learning_rate=0.25, gamma=1e-3, min_child_weight=15, 
                   max_depth=3, colsample_bylevel=.7, colsample_bytree=.7, reg_lambda=10,
                   max_delta_step=10)
reg.fit(X, y)

df_test = pd.read_csv('../input/2018s1-mo444-assignment-01//test.csv')
df_test = df_test.drop(['url'], axis=1)
# Columns were removed from df in earlier cells (e.g. 'is_weekend') but not
# from the test file, so select exactly the columns the scaler was fitted on.
# NOTE(review): assumes test.csv contains every training column - confirm.
X_test = scaler.transform(df_test[feature_columns])

df_target = pd.read_csv('../input/2018s1-mo444-assignment-01//test_target.csv')
df_target.shares = np.log(df_target['shares'].values)

y_pred = reg.predict(X_test)
print("Model xgboost - RMSE Score: {}   MAE Score: {}".format(root_mean_squared_error(df_target.shares, y_pred), mean_absolute_error(np.exp(df_target.shares), np.exp(y_pred))))

In [None]:
# Overlay the latest predictions and the log test target, both plotted by position.
plt.plot(y_pred)
plt.plot(df_target['shares'], alpha=0.7)
plt.show()

In [None]:
###################### Random Forest ######################
# Fit a random forest on standardized features and report RMSE (log target)
# on the separate test files.

scaler = StandardScaler()
X = df.drop(['shares'], axis=1)
# Remember the training columns so the test frame can be aligned to them.
feature_columns = list(X.columns)
X = scaler.fit_transform(X)

y = df.shares

# Seeded so the reported score is reproducible, matching the random_state
# used elsewhere in this notebook.
reg = RandomForestRegressor(max_depth=10, min_samples_leaf=50, n_estimators=25, random_state=50)
reg.fit(X, y)

df_test = pd.read_csv('../input/2018s1-mo444-assignment-01//test.csv')
df_test = df_test.drop(['url'], axis=1)
# Columns were removed from df in earlier cells (e.g. 'is_weekend') but not
# from the test file, so select exactly the columns the scaler was fitted on.
# NOTE(review): assumes test.csv contains every training column - confirm.
X_test = scaler.transform(df_test[feature_columns])

df_target = pd.read_csv('../input/2018s1-mo444-assignment-01//test_target.csv')
df_target.shares = np.log(df_target['shares'].values)

y_pred = reg.predict(X_test)
print("Model random forest tree - RMSE Score: {}".format(root_mean_squared_error(df_target.shares, y_pred)))

In [None]:
# Polynomial regression on a 2000-row subsample, with a 70/30 hold-out split.
# The subsample is seeded like the split, so the printed scores are
# reproducible across kernel restarts.
poly_sample = df.sample(2000, random_state=50)
y_poly = poly_sample.shares
X_poly = poly_sample.drop('shares', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y_poly, test_size=.3, random_state=50)

# NOTE(review): the number of polynomial terms grows combinatorially with the
# degree and the column count - confirm degree 5 fits in memory for this frame.
degrees = [5]

for degree in degrees:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression())
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # RMSE on the log target; MAE after mapping both sides back with exp.
    print("Model Linear Regression - Degree {}\nRMSE Score: {}  -  MAE Score: {}".format(degree, root_mean_squared_error(y_test, y_pred), mean_absolute_error(np.exp(y_test), np.exp(y_pred))))

In [None]:
# Features and target for the full training frame.
y = df.shares
X = df.drop(['shares'], axis=1)

################################################## PIPELINE ##################################################
# Column selectors: each one picks a group of columns out of a DataFrame.
get_discrete_data = FunctionTransformer(lambda frame: frame[discrete_vars], validate=False)
get_other_data = FunctionTransformer(lambda frame: frame[other_data], validate=False)

# Each group is selected and standardized on its own, then the two results
# are concatenated side by side.
discrete_branch = Pipeline([
    ('selector', get_discrete_data),
    ('scaler', StandardScaler()),
])
other_branch = Pipeline([
    ('selector', get_other_data),
    ('scaler', StandardScaler()),
])
process = FeatureUnion([
    ('discrete_features', discrete_branch),
    ('other_features', other_branch),
])

# Degree-3 polynomial expansion -> 10 principal components -> linear regression.
pipeline = Pipeline([
    ('discrete_process', process),
    ('poly', PolynomialFeatures(degree=3)),
    ('pca', PCA(n_components=10)),
    ('linear', LinearRegression()),
])

pipeline.fit(X, y)

# Test features (without 'url') and the log of the test target.
df_test = pd.read_csv('../input/2018s1-mo444-assignment-01//test.csv').drop('url', axis=1)

df_target = pd.read_csv('../input/2018s1-mo444-assignment-01//test_target.csv')
df_target['shares'] = np.log(df_target['shares'].values)

y_pred = pipeline.predict(df_test)
print("Model 1 - RMSE Score: {}".format(root_mean_squared_error(df_target.shares, y_pred)))
#########################################################################################################################

In [None]:
# Predicted values (x axis) against the log test target (y axis).
plt.figure(figsize=[5, 5])
plt.scatter(x=y_pred, y=df_target['shares'])
plt.xlabel('Predicted shares')
plt.ylabel('True shares')
plt.show()

In [None]:
# Build an XGBRegressor with default arguments; as the last expression in the
# cell, its repr is what the cell displays. The instance is not kept.
XGBRegressor()

In [None]:
# Train/test RMSE of an SGD regressor as a function of max_iter.
X = df.drop(['shares'], axis=1)
y = df.shares

# NOTE: this hold-out split is not used by the sweep below, which fits on the
# full (X, y) and evaluates on the separate test files.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=50)

################################################## PIPELINE ##################################################
# Column selectors: each one picks a group of columns out of a DataFrame.
get_discrete_data = FunctionTransformer(lambda x: x[discrete_vars], validate=False)
get_other_data = FunctionTransformer(lambda x: x[other_data], validate=False)

n_iteration = [5, 10, 50, 100, 500, 1000, 2000, 3000, 4000] 
results_train = []
results_test = []

# The test data does not depend on the loop variable, so it is read and
# log-transformed once instead of on every iteration.
df_test = pd.read_csv('../input/2018s1-mo444-assignment-01//test.csv')
df_test = df_test.drop(['url'], axis=1)

df_target = pd.read_csv('../input/2018s1-mo444-assignment-01//test_target.csv')
df_target.shares = np.log(df_target['shares'].values)

for n in n_iteration:
    # A fresh preprocessing union and regressor for every max_iter value.
    process = FeatureUnion([
        ('discrete_features', Pipeline([
            ('selector', get_discrete_data),
            ('scaler', StandardScaler())
        ])),
        ('other_features', Pipeline([
            ('selector', get_other_data),
            ('scaler', StandardScaler())
        ]))
    ])
    pipeline = Pipeline([
        ('discrete_process', process),
        ('sgdregressor', SGDRegressor(alpha=0.001, max_iter=n, random_state=50))
    ])

    pipeline.fit(X, y)

    # Error on the data the model was fitted on.
    y_pred_train = pipeline.predict(X)
    results_train.append(root_mean_squared_error(y, y_pred_train))

    # Error on the separate test files.
    y_pred = pipeline.predict(df_test)
    results_test.append(root_mean_squared_error(df_target.shares, y_pred))
#########################################################################################################################

In [None]:
######### plot #########
# RMSE per max_iter setting: dashed line for the training data, solid line
# for the values collected in results_test.
tick_positions = np.arange(len(n_iteration))
tick_labels = [str(n) for n in n_iteration]

plt.figure(figsize=[5, 5])
plt.plot(results_train, linestyle='--', label='Training Set Error')
plt.plot(results_test, label='Validation Set Error')
plt.xticks(tick_positions, tick_labels)
plt.xlabel('iterations')
plt.ylabel('cost function')
plt.legend()
plt.show()

**Model test on the test set**

In [None]:
# Draw a 3000-row subsample and split it into features and target.
# The sample is seeded so downstream scores are reproducible, and the target
# is removed with a non-mutating drop so the cell can be re-run safely.
new_sample = df.sample(3000, random_state=50).reset_index(drop=True)
new_y = new_sample.shares
new_X = new_sample.drop('shares', axis=1)

In [None]:
# Compare a closed-form linear regression with SGD over a grid of alpha values.
alpha = [1e-1, 2e-1, 3e-1, 4e-1, 5e-1, 6e-1, 7e-1, 8e-1, 9e-1, 1]#[1, 5e-1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]

# Split first, then fit the scaler on the training part only. Fitting it on
# all rows would leak the held-out rows' statistics into training.
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=.2, random_state=50)

feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

# Baseline: ordinary least squares.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_nrm = lin_reg.predict(X_test)
result_nrm = root_mean_squared_error(y_test, y_pred_nrm)

# One hold-out RMSE per alpha value, in the same order as `alpha`.
result_sgd = []
for a in alpha:
    reg = SGDRegressor(alpha=a, max_iter=100, random_state=50)

    reg.fit(X_train, y_train)
    y_pred_sgd = reg.predict(X_test)

    result_sgd.append(root_mean_squared_error(y_test, y_pred_sgd))

In [None]:
# result_sgd is a plain list of RMSE values (one per alpha), so it cannot be
# indexed with [:, 1]; display the list itself.
result_sgd

In [None]:
# SGD hold-out RMSE per alpha, with the linear-regression RMSE as a dashed
# horizontal reference line.
alpha_positions = np.arange(len(alpha))
alpha_labels = [str(a) for a in alpha]

plt.figure(figsize=[5, 5])
plt.plot(result_sgd, label='SGD')
plt.axhline(result_nrm, color='darkblue', linestyle='--', label='Normal')
plt.xticks(alpha_positions, alpha_labels)
plt.legend()
plt.xlabel(r'$\alpha$ value')
plt.ylabel('cost function')
plt.show()

In [None]:
# Models
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Sweep of SGDRegressor's `alpha` with the epsilon-insensitive loss.
# NOTE: the values in `lr` are passed as `alpha`, and the printed label still
# says "lr".
lr = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
results = []

scale = True
if scale:
    sc = MinMaxScaler()
    X_train = sc.fit_transform(X_train)
    # Reuse the ranges learned on the training data. Refitting on the test
    # set would put the two sets on different scales.
    X_test = sc.transform(X_test)

# The loop variable is not called `alpha`, so it does not overwrite the
# `alpha` list that the alpha-sweep plotting cell relies on.
for reg_strength in lr:
    model = SGDRegressor(alpha=reg_strength, loss='epsilon_insensitive', epsilon=1e-3, average=True, penalty='l2', max_iter=1500, random_state=50)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Store (label, (RMSE, R^2)) for each setting.
    results.append(("SGDRegressor |  lr: " + str(reg_strength),(root_mean_squared_error(y_test, y_pred), model.score(X_test, y_test))))

for result in results:
    print(result[0], result[1])

In [None]:
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.layers.core import Dense, Activation
from keras.models import Sequential
from keras import metrics

# Small fully connected network: one tanh hidden layer of 30 units and a
# single linear output, trained with Adam on mean squared error.
# NOTE(review): relies on X still being the feature DataFrame built in an
# earlier cell - confirm the execution order.
my_opt = Adam()
model = Sequential()
model.add(Dense(30, activation='tanh', input_shape=(X.shape[1],)))
model.add(Dense(1))
model.compile(optimizer=my_opt, loss='mse')
model.fit(X.values, y.values, epochs=50)

df_test = pd.read_csv('../input/2018s1-mo444-assignment-01//test.csv')
df_test = df_test.drop(['url'], axis=1)
# Give the network exactly the columns it was trained on, in the same order.
# Columns such as 'is_weekend' were dropped from the training frame only.
# NOTE(review): assumes test.csv contains every training column - confirm.
df_test = df_test[list(X.columns)]

df_target = pd.read_csv('../input/2018s1-mo444-assignment-01//test_target.csv')
df_target.shares = np.log(df_target['shares'].values)

# predict() returns shape (n, 1); flatten it so it lines up with the target.
y_pred = model.predict(df_test.values).ravel()

# Keras models have no .score(), so compute R^2 directly as
# 1 - SS_res / SS_tot.
target_values = df_target.shares.values
residual_ss = np.sum((target_values - y_pred) ** 2)
total_ss = np.sum((target_values - target_values.mean()) ** 2)
r2_value = 1 - residual_ss / total_ss

print("Model 1 - RMSE Score: {}   -   r2 Score: {}".format(root_mean_squared_error(df_target.shares, y_pred),
                                                           r2_value))