# This notebook contains an example of a complete execution using classes in .py files

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import importlib
import sys
import pickle

from verstack.stratified_continuous_split import scsplit 

In [None]:
# Development helper: run this cell after editing any of the project's .py files while
# the kernel is alive, so the modules are re-executed from disk. Re-run the
# `from ... import ...` cell afterwards so the imported names point at the reloaded code.
import feature_extraction, classification, regression, prediction

importlib.reload(feature_extraction)
importlib.reload(classification)
importlib.reload(regression)
importlib.reload(prediction);

In [None]:
from feature_extraction import FeatureExtraction, FeatureExtraction_Text
from classification import Classification, classifier_performance 
from regression import Regression, model_performance
from prediction import transform_eval, pred_eval, save_pred

## 1. Feature selection and extraction

### • First computation for trainset transformation

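If you are using this notebook for the first time, run the cells in this subsection: they compute the training features and save them to `data/train_features.csv`. On later runs you can skip ahead to "Resume with already-calculated features".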

In [5]:
# Load the training set from data/train.csv.
train_data = pd.read_csv('data/train.csv')

In [6]:
# Run the feature extraction on the training data; the result is read later
# from FE.transformed_df.
FE = FeatureExtraction(train_data)
FE.transform()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=665777.0), HTML(value='')))




In [7]:
# Settings passed to the text-feature extractor.
# NOTE(review): presumably max_features caps the vectorizer vocabulary and dim_pca is the
# number of PCA components kept — confirm in feature_extraction.py.
max_features, dim_pca = 100, 20
FE_text = FeatureExtraction_Text(
    df=train_data,
    max_features=max_features,
    dim_pca=dim_pca,
)
FE_text.fit()

In [8]:
# Save the fitted FE_text so it can be reused for the eval prediction.
# The context manager closes (and flushes) the file even if pickling raises; the original
# passed an anonymous open() handle that was never closed explicitly.
with open('FeatureExtraction_Text_train', 'wb') as fe_text_file:
    pickle.dump(FE_text, fe_text_file)

In [9]:
# Apply the fitted text extractor to the training data; the output is read from
# FE_text.transformed_df when the feature sets are concatenated.
FE_text.transform(df=train_data)

In [10]:
# Join both feature sets column-wise and cache them on disk so later sessions can skip
# the extraction. The index is not written to the file (index=False).
features_df = pd.concat([FE.transformed_df, FE_text.transformed_df], axis=1)
features_df.to_csv("data/train_features.csv", index=False)

In [14]:
# Disabled visualisation, kept inside a string literal so it does not execute (the trailing
# `;` hides the string from the cell output): a heatmap of FE_text.pca.components_ with the
# vectorizer's feature names as columns.
# NOTE(review): if re-enabled, `map` would shadow the Python builtin. Also, presumably the
# vectorizer is a scikit-learn one, in which case newer releases expose
# get_feature_names_out() instead of get_feature_names() — confirm before use.
'''
map = pd.DataFrame(FE_text.pca.components_, index=range(1,dim_pca+1), columns=FE_text.vectorizer.get_feature_names())
plt.figure(figsize=(20,8))
sns.heatmap(map,cmap='Reds')
plt.show()
''';

### • Resume with already-calculated features

Instead of transforming the data again, we import the already-calculated features (for details check the FeatureExtraction class).

In [None]:
# Reload the cached training features. The cache is written with index=False, so the file
# has no index column: reading it with index_col=0 would turn the first feature column
# into the index and remove it from the frame's columns.
features_df = pd.read_csv("data/train_features.csv")

### Choose features

In [15]:
# Columns fed to the models: the hand-picked features first, followed by the
# text components named PCA1 .. PCA<dim_pca>.
dim_pca = 20
features = [
    "user_verified",
    "user_statuses_count",
    "user_followers_count",
    "user_friends_count",
    "num_hashtags",
    "num_mentions",
    "positive",
    "neutral",
    "negative",
    "length",
    "hashtag_score",
]
features += [f"PCA{k}" for k in range(1, dim_pca + 1)]

## 2. Train / Test Split

In [16]:
def label(count, bins):
    """Assign a class to a number of retweets.

    The class is the position of the first threshold in ``bins`` (checked in
    list order) that is greater than or equal to ``count``. A count larger
    than every threshold gets the extra last class ``len(bins)``.

    Args:
        count (int): number of retweets 
        bins (list): list of thresholds; may be empty, in which case every
            count is assigned class 0
    Returns:
        int: class number, between 0 and len(bins) inclusive
    """
    for i, elm in enumerate(bins):
        if count <= elm:
            return i
    # Above every threshold. len(bins) equals the former `i + 1` for non-empty bins and
    # stays well-defined for empty bins, where the loop variable is never bound.
    return len(bins)

In [24]:
# Single threshold at 0 -> two classes: 0 for retweet_count <= 0, 1 otherwise.
bins = [0]
features_df["class"] = features_df["retweet_count"].apply(lambda x: label(x, bins))

# retweet_count is carried along with the features through the split so that it can be
# pulled back out of X_train / X_test afterwards with matching rows.
X = pd.concat([features_df[features], features_df["retweet_count"]], axis=1)
Y = features_df[["class"]] 

# 80/20 split stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = scsplit(X, Y["class"], stratify=Y["class"], test_size=0.2, random_state=0, continuous=False)

# Class labels as single-column frames, used by the classification step.
y_train_class = pd.DataFrame(y_train, columns=["class"])
y_test_class = pd.DataFrame(y_test, columns=["class"])

# From here on y_train / y_test hold the retweet counts, not the classes.
y_train = X_train["retweet_count"]
# Assignment instead of reset_index(inplace=True): same result, but it does not mutate an
# object derived from another frame (which can trigger SettingWithCopyWarning).
y_test = X_test["retweet_count"].reset_index(drop=True)

X_train = X_train[features]
X_test = X_test[features].reset_index(drop=True)

## 3. Classification based on retweet_count

In [25]:
# Build the classifier with the same bins used for the "class" column and run classify()
# on the training features and class labels.
# NOTE(review): presumably this fits classifier.model — confirm in classification.py.
classifier = Classification(bins)
classifier.classify(X_train, y_train_class)

In [None]:
# Horizontal bar chart of the ten largest feature importances reported by classifier.model.
importances = classifier.model.feature_importances_
# Sorting the negated values yields positions in descending order of importance.
indices = np.argsort(-importances)[:10]

fig, ax = plt.subplots(figsize=(8, 5))
ax.set_title('Feature Importances')
ax.barh(range(len(indices)), importances[indices], color='b', align='center')
ax.set_yticks(range(len(indices)))
ax.set_yticklabels([features[i] for i in indices])
ax.set_xlabel('Relative Importance')
plt.show()

## 4. Regression

In [27]:
# Run regression_per_class on the training features and retweet counts. The ten thresholds
# give eleven groups (matching the progress-bar total of 11); the performance output in
# section 5 labels them as user_followers_count ranges.
regressor = Regression(bins = [5e2,1e3,5e3,1e4,5e4,1e5,5e5,1e6,5e6,1e7])
regressor.regression_per_class(X_train, y_train, features)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11.0), HTML(value='')))




## 5. Performance

In [26]:
# Precision / recall / F1 report of the classifier on the test split.
classifier_performance(classifier, X_test, y_test_class)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     84561
          +0       0.96      0.93      0.95     48595

    accuracy                           0.96    133156
   macro avg       0.96      0.96      0.96    133156
weighted avg       0.96      0.96      0.96    133156



In [43]:
# Prediction error (MAE) of classifier + regressor on the test split, per group and overall.
model_performance(classifier, regressor, features, X_test, y_test)

Prediction error (MAE):
0 : 36.68 - 85877
For non zero prediction, regression based on user_followers_count:
≤ 500.0 : 57.32 - 6786
≤ 1000.0 : 90.25 - 2941
≤ 5000.0 : 217.44 - 10181
≤ 10000.0 : 146.08 - 4671
≤ 50000.0 : 260.78 - 9251
≤ 100000.0 : 355.1 - 3054
≤ 500000.0 : 490.64 - 5275
≤ 1000000.0 : 855.68 - 1499
≤ 5000000.0 : 976.36 - 2310
≤ 10000000.0 : 858.07 - 646
+10000000.0 : 2466.55 - 665
Overall: 139.06931351612292


## 6. Predictions on eval data

### • First computation for evalset transformation

In [31]:
# Load the evaluation set from data/evaluation.csv.
eval_data = pd.read_csv("data/evaluation.csv")

In [32]:
# Compute the evaluation features and cache them on disk, without the index (index=False).
# NOTE(review): presumably transform_eval reuses the pickled FeatureExtraction_Text_train
# saved earlier — confirm in prediction.py.
eval_df = transform_eval(eval_data)
eval_df.to_csv("data/eval_features.csv", index=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=285334.0), HTML(value='')))




### • Continue

As with the training set, to save time we load the already-transformed evaluation features from `data/eval_features.csv` instead of recomputing them.

In [34]:
# Reload the cached evaluation features.
eval_df = pd.read_csv("data/eval_features.csv")

In [39]:
# Predict on the evaluation set, passing the same feature columns that were used for training.
y_pred_eval = pred_eval(classifier, regressor, eval_df[features])

In [40]:
# Summary statistics of the evaluation predictions, as a quick sanity check.
y_pred_eval.describe()

Unnamed: 0,pred
count,285334.0
mean,20.551865
std,317.949479
min,0.0
25%,0.0
50%,0.0
75%,2.876603
max,34238.924089


In [46]:
# Export the evaluation predictions through save_pred, using "predictions.txt" as filename.
save_pred(eval_df, y_pred_eval, filename="predictions.txt")

## 7. Save models

In [29]:
def save_model(classifier, regressor):
    """Pickle the classifier and the regressor under ``saved_models/``.

    Args:
        classifier: object written to ``saved_models/classifier``.
        regressor: object written to ``saved_models/regressor``.
    """
    import os  # local import: only this helper needs it

    # Create the target folder on first use instead of failing with FileNotFoundError.
    os.makedirs("saved_models", exist_ok=True)
    # Context managers close (and flush) each file even if pickling raises.
    with open("saved_models/classifier", 'wb') as classifier_file:
        pickle.dump(classifier, classifier_file)
    with open("saved_models/regressor", 'wb') as regressor_file:
        pickle.dump(regressor, regressor_file)

In [30]:
# Persist the classifier and regressor objects to disk.
save_model(classifier, regressor)