In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import classification_report
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
from sklearn.svm import SVC
from sklearn import svm
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20, and GridSearchCV is already
# imported from sklearn.model_selection above — confirm whether this import is still needed.
from sklearn import grid_search
import random

%matplotlib inline

import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation notices and pandas' chained-assignment
# warnings — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')



In [12]:
# Load the raw influencer dataset; the relative path is resolved against the working directory.
influencers_csv = "social_influencers.csv"
full_df = pd.read_csv(influencers_csv)

In [13]:
# Work on the first 12 columns only. Take an explicit copy so that later writes to
# `df` (values are blanked out further down) operate on an independent frame rather
# than on a slice of `full_df`, which keeps `full_df` intact.
df = full_df.iloc[:, :12].copy()

In [14]:
# Preview the first rows of the 12-column working frame.
df.head()

Unnamed: 0,Choice,A_follower_count,A_following_count,A_listed_count,A_mentions_received,A_retweets_received,A_mentions_sent,A_retweets_sent,A_posts,A_network_feature_1,A_network_feature_2,A_network_feature_3
0,0,228,302,3,0.583979,0.100503,0.100503,0.100503,0.36215,2,166.5,11355.0
1,0,21591,1179,228,90.456506,25.798292,5.709329,1.111159,5.17662,369,18.442971,1330.366048
2,0,7310,1215,101,25.503644,9.556347,5.361519,0.591206,3.589718,95,68.927835,5999.896907
3,0,20,7,2,7.690824,0.277306,1.331508,0.100503,2.830627,6,2.0,96.166667
4,1,45589,862,2641,148.854279,36.998884,27.881768,3.333492,23.861282,551,127.404293,2833.847943


In [15]:
# Baseline column summary (dtypes and non-null counts) before any values are blanked out.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5500 entries, 0 to 5499
Data columns (total 12 columns):
Choice                 5500 non-null int64
A_follower_count       5500 non-null int64
A_following_count      5500 non-null int64
A_listed_count         5500 non-null int64
A_mentions_received    5500 non-null float64
A_retweets_received    5500 non-null float64
A_mentions_sent        5500 non-null float64
A_retweets_sent        5500 non-null float64
A_posts                5500 non-null float64
A_network_feature_1    5500 non-null int64
A_network_feature_2    5500 non-null float64
A_network_feature_3    5500 non-null float64
dtypes: float64(7), int64(5)
memory usage: 515.7 KB


In [16]:
# Pick 100 distinct random row positions (0 to 5499) and then blank out A_follower_count in those rows

In [17]:
# Choose the 100 row positions whose A_follower_count will be blanked out.
# A dedicated, seeded generator makes the selection identical on every
# Restart & Run All without touching the global `random` state, and len(df)
# replaces the hard-coded row count so the cell follows the size of the data.
rows_to_delete = random.Random(42).sample(range(len(df)), 100)

In [18]:
# Blank out A_follower_count in the sampled rows.
# NaN needs a float column, so cast explicitly instead of relying on an implicit
# int64 -> float64 upcast during assignment.
df['A_follower_count'] = df['A_follower_count'].astype('float64')
# Use a single .loc assignment rather than chained indexing (df[col][row] = ...):
# the chained form writes through an intermediate Series and is not guaranteed to
# reach `df`. rows_to_delete holds positions, which coincide with the labels of
# df's default RangeIndex.
df.loc[rows_to_delete, 'A_follower_count'] = np.nan

In [19]:
# Sanity check: A_follower_count should now report 100 fewer non-null values than before.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5500 entries, 0 to 5499
Data columns (total 12 columns):
Choice                 5500 non-null int64
A_follower_count       5400 non-null float64
A_following_count      5500 non-null int64
A_listed_count         5500 non-null int64
A_mentions_received    5500 non-null float64
A_retweets_received    5500 non-null float64
A_mentions_sent        5500 non-null float64
A_retweets_sent        5500 non-null float64
A_posts                5500 non-null float64
A_network_feature_1    5500 non-null int64
A_network_feature_2    5500 non-null float64
A_network_feature_3    5500 non-null float64
dtypes: float64(8), int64(4)
memory usage: 515.7 KB


In [20]:
# Need to first split the data into complete/not-complete rows, then split the complete rows into
# target/features, train a model on them, and finally run that model on the incomplete rows

In [21]:
# Splitting into complete and non-complete

In [22]:
# Complete cases: keep only the rows in which no column is NaN.
df_complete_follower_count = df.dropna(axis=0, how='any')

In [23]:
# (rows, columns) of the complete-case frame.
df_complete_follower_count.shape

(5400, 12)

In [24]:
# Locate the missing cells once and reuse the mask for both the row and the column selection.
# NOTE(review): only columns that contain a NaN are kept, so the result holds the (all-NaN)
# A_follower_count values but none of the feature columns — confirm that is what later cells need.
null_mask = df.isnull()
df_empty_follower_count = df.loc[null_mask.any(axis=1), null_mask.any(axis=0)]

In [25]:
# (rows, columns) of the frame holding the rows with a missing value.
df_empty_follower_count.shape

(100, 1)

In [26]:
# Splitting complete into target/features

In [27]:
# Feature matrix: every column of the complete-case frame except the one being imputed.
target_column = 'A_follower_count'
X = df_complete_follower_count.drop(target_column, axis='columns')

In [28]:
# Preview the feature matrix; A_follower_count should no longer be among the columns.
X.head()

Unnamed: 0,Choice,A_following_count,A_listed_count,A_mentions_received,A_retweets_received,A_mentions_sent,A_retweets_sent,A_posts,A_network_feature_1,A_network_feature_2,A_network_feature_3
0,0,302,3,0.583979,0.100503,0.100503,0.100503,0.36215,2,166.5,11355.0
1,0,1179,228,90.456506,25.798292,5.709329,1.111159,5.17662,369,18.442971,1330.366048
2,0,1215,101,25.503644,9.556347,5.361519,0.591206,3.589718,95,68.927835,5999.896907
3,0,7,2,7.690824,0.277306,1.331508,0.100503,2.830627,6,2.0,96.166667
4,1,862,2641,148.854279,36.998884,27.881768,3.333492,23.861282,551,127.404293,2833.847943


In [29]:
# Regression target: the known follower counts from the complete-case frame.
Y = df_complete_follower_count.loc[:, 'A_follower_count']

In [30]:
# Preview the target values.
Y.head()

0      228.0
1    21591.0
2     7310.0
3       20.0
4    45589.0
Name: A_follower_count, dtype: float64

In [31]:
# Training various models - which type of model we use depends on whether the target value is categorical or not...

In [32]:
# Ordinary least squares fit on the complete-case data; fit() returns the estimator itself.
linreg_follower_count = LinearRegression().fit(X, Y)

# Scored on the same rows used for fitting, i.e. an in-sample R^2 rather than a held-out one.
linreg_score = linreg_follower_count.score(X, Y)
print('R_squared:', linreg_score)

R_squared: 0.815877510176


In [33]:
# Try out ridge and lasso

In [34]:
from sklearn.linear_model import Ridge
from sklearn.utils import shuffle
from sklearn.linear_model import Lasso

In [35]:
# Ridge regression with the estimator's built-in feature normalisation switched on.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the scikit-learn version this notebook targets.
ridgereg = Ridge(normalize=True).fit(X, Y)
# In-sample predictions on the features the model was fitted on.
y_pred = ridgereg.predict(X)

In [36]:
# In-sample score of the ridge model (evaluated on the same X, Y it was fitted on).
ridgereg.score(X, Y)

0.54997203239150994

In [37]:
# Lasso regression with the estimator's built-in feature normalisation switched on.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the scikit-learn version this notebook targets.
lassoreg = Lasso(normalize=True).fit(X, Y)
# In-sample predictions; this overwrites the ridge predictions stored in y_pred.
y_pred = lassoreg.predict(X)

In [38]:
# In-sample score of the lasso model (evaluated on the same X, Y it was fitted on).
lassoreg.score(X, Y)

0.81587720304064004