In [None]:
import csv
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive so the files under /content/drive are readable (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Single place to change when the competition CSVs live somewhere else.
DATA_DIR = '/content/drive/My Drive/EP/INF554/COVID-19/data'

# Labelled tweets used for training and local validation.
tweets = pd.read_csv(DATA_DIR + '/train.csv')
# Tweets whose ids are written to the prediction file at the end of the notebook.
eval_ds = pd.read_csv(DATA_DIR + '/evaluation.csv')

In [None]:
# Column names, dtypes and non-null counts of the training frame.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665777 entries, 0 to 665776
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   id                    665777 non-null  int64 
 1   timestamp             665777 non-null  int64 
 2   retweet_count         665777 non-null  int64 
 3   user_verified         665777 non-null  bool  
 4   user_statuses_count   665777 non-null  int64 
 5   user_followers_count  665777 non-null  int64 
 6   user_friends_count    665777 non-null  int64 
 7   user_mentions         54291 non-null   object
 8   urls                  214080 non-null  object
 9   hashtags              72451 non-null   object
 10  text                  665777 non-null  object
dtypes: bool(1), int64(6), object(4)
memory usage: 51.4+ MB


In [None]:
# Summary statistics of the user_statuses_count column.
tweets['user_statuses_count'].describe()

count    6.657770e+05
mean     4.167295e+04
std      9.848516e+04
min      0.000000e+00
25%      2.352000e+03
50%      1.080400e+04
75%      3.809900e+04
max      7.203222e+06
Name: user_statuses_count, dtype: float64

Cleaning the data

In [None]:
def day(t):
    """Return the day of the month (1-31) of a Unix timestamp given in milliseconds.

    The timestamp is interpreted in UTC so the extracted day does not depend on
    the timezone configured on the machine running the notebook.

    Parameters
    ----------
    t : int or float
        Milliseconds since the Unix epoch.

    Returns
    -------
    int
        Day of the month in UTC.
    """
    # fromtimestamp() expects seconds; without `tz` it would use the local timezone.
    return datetime.fromtimestamp(t / 1000, tz=timezone.utc).day

def clean(df):
    """Return a cleaned copy of a raw tweet frame; the input is left untouched.

    - ``timestamp`` is replaced by the value returned by ``day``.
    - ``user_verified`` becomes 0/1; the three user count columns get missing values set to 0.
    - ``user_mentions`` and ``hashtags`` become 0/1 "is non-empty" flags.
    - ``text`` gets missing values replaced by the empty string.
    - ``urls`` becomes 1 when the urls field is non-empty or the text contains
      'http', else 0.
    - ``id`` is dropped.
    """
    def as_flag(value):
        # 1 for any truthy value (e.g. a non-empty string), 0 otherwise.
        return int(bool(value))

    out = df.copy()
    out.timestamp = out.timestamp.apply(day)
    out.user_verified = out.user_verified.fillna(False).astype(int)
    for count_col in ('user_statuses_count', 'user_followers_count', 'user_friends_count'):
        out[count_col] = out[count_col].fillna(0)
    out.user_mentions = out.user_mentions.fillna('').apply(as_flag)
    out.hashtags = out.hashtags.fillna('').astype(bool).astype(int)
    out.text = out.text.fillna('')

    # A tweet counts as "has a URL" if either signal fires, so cap the sum of the two flags at 1.
    url_field_flag = out.urls.fillna('').apply(as_flag)
    http_in_text_flag = out.text.apply(lambda body: int('http' in body))
    out.urls = (url_field_flag + http_in_text_flag).apply(lambda total: min(total, 1))

    return out.drop('id', axis=1)

Insert one 0/1 indicator column per category of words that tend to get a high number of retweets

In [None]:
def insert_or(words, title, df):
    """Insert, in place, a 0/1 column telling whether ``df.text`` contains any of ``words``.

    Matching is a case-insensitive *substring* test (the text is lower-cased,
    the keywords are used as given), so a keyword also matches inside longer words.
    The new column is placed first (position 0). If a column called ``title``
    already exists the frame is left unchanged.

    Parameters
    ----------
    words : iterable of str
        Keywords to look for.
    title : str
        Name of the indicator column to create.
    df : pandas.DataFrame
        Frame with a string ``text`` column; modified in place.
    """
    if title in df.columns:
        return

    def contains_any(text):
        # Lower-case once per tweet instead of once per keyword, and stop at the first hit.
        lowered = text.lower()
        return int(any(w in lowered for w in words))

    df.insert(0, title, df.text.apply(contains_any))
        
def insert_cat(df):
    """Return a copy of ``df`` with one 0/1 keyword-indicator column per word category.

    Categories are inserted in the order listed below; each insertion goes to
    position 0, so the last category listed ends up as the first column.
    """
    categories = (
        ('trump', ['president', 'trump', 'donald', 'dt']),
        ('virus', ['virus', 'corona', 'covid', 'sick', 'flu']),
        ('doctor', ['doctor', 'nurse', 'kid', 'hospital', 'administration', 'medical', 'save']),
        ('america', ['america', 'democrat']),
        ('china', ['chin', 'wuhan']),
        ('handwash', ['hand', 'wash']),
    )
    out = df.copy()
    for title, keywords in categories:
        insert_or(keywords, title, out)
    return out

def insert_big_cat(df):
    """Return a copy of ``df`` with a single 0/1 ``words`` column.

    The column is 1 when the tweet text contains any keyword from the list
    below, 0 otherwise.
    """
    words = [
        'president', 'trump', 'donald', 'dt',
        'virus', 'corona', 'covid', 'sick', 'flu',
        'doctor', 'nurse', 'kid', 'hospital', 'administration', 'medical', 'save',
        'america', 'democrat',
        'chin', 'wuhan',
        'hand', 'wash',
    ]
    out = df.copy()
    insert_or(words, 'words', out)
    return out

def transform(df):
    """Build the model feature frame from a raw tweet frame without modifying the input.

    Runs ``clean`` and ``insert_cat``, then replaces the ``text`` column by
    ``words_count``: the number of space-separated tokens, capped at 140.
    """
    def count_words(tweet):
        # Splitting on a single space means consecutive spaces yield empty tokens that are counted too.
        return min(140, len(tweet.split(' ')))

    # clean() copies its input, so the caller's frame is never touched.
    features = insert_cat(clean(df))
    if 'text' in features.columns:
        features.insert(0, 'words_count', features.text.apply(count_words))
        features = features.drop('text', axis=1)
    return features

In [None]:
# Target: the retweet count of every training tweet.
y = tweets['retweet_count'].copy()
# Features for the labelled tweets (target removed) and for the evaluation tweets.
X = transform(tweets.drop(columns='retweet_count'))
Z = transform(eval_ds)
X.head()

Unnamed: 0,words_count,handwash,china,america,doctor,virus,trump,timestamp,user_verified,user_statuses_count,user_followers_count,user_friends_count,user_mentions,urls,hashtags
0,4,0,0,0,0,0,0,5,0,68460,1101,1226,0,0,0
1,14,0,0,0,0,0,0,3,0,309,51,202,0,0,0
2,21,0,0,0,0,0,1,4,0,3241,1675,2325,0,0,0
3,16,0,0,0,0,0,0,2,0,32327,667,304,0,0,0
4,15,0,0,0,0,0,0,4,0,581,42,127,0,0,0


In [None]:
# Hold out 30% of the labelled tweets for local evaluation.
# For a submission, train on everything and predict the evaluation set instead:
#   X_train = X.copy(); X_test = Z.copy(); y_train = y.copy()
X_train, X_test, y_train, y_test = train_test_split(X.copy(), y.copy(), test_size=0.3, random_state=3)

# Set the day column aside; it is removed from the model features.
X_train_day = X_train.pop('timestamp')
X_test_day = X_test.pop('timestamp')

# Inclusive upper bounds of retweet classes 0..3; counts above t4 fall in class 4.
t1 = 10
t2 = 300
t3 = 1500
t4 = 3000
class_bounds = (t1, t2, t3, t4)
# The class label equals the number of bounds the count strictly exceeds.
y_train_class = y_train.apply(lambda count: sum(count > bound for bound in class_bounds))

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Stage 1: predict the retweet-count class (labels from y_train_class) of every tweet.
rfc = RandomForestClassifier(random_state=2, n_estimators=100)
rfc.fit(X_train, y_train_class)
# Despite the "X_" prefix, this holds the predicted class label for each row of X_test.
X_test_class = rfc.predict(X_test)

CPU times: user 1min 56s, sys: 674 ms, total: 1min 57s
Wall time: 1min 57s


In [None]:
upper_bound = 23000
lower_bound = 1
# The regressor is trained only on tweets with lower_bound <= retweets < upper_bound.
in_range = (y_train >= lower_bound) & (y_train < upper_bound)
X_train_reg = X_train[in_range]
y_train_reg = y_train[in_range]

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Stage 2: regress the retweet count, trained on the bounded subset X_train_reg / y_train_reg.
rfr = RandomForestRegressor(random_state=2, n_estimators=100)
rfr.fit(X_train_reg, y_train_reg)
y_pred = rfr.predict(X_test)
# Untouched copy of the raw regression output; later cells read `tmp` when rescaling per class.
tmp = y_pred.copy()

CPU times: user 2min 28s, sys: 771 ms, total: 2min 29s
Wall time: 2min 29s


In [None]:
def _fit_class_curve(raw, fitted, mask, y_true, zero, exponents, divisors, offsets):
    """Grid-search (i, j, k) for one class so that ``raw[mask]**i / j + k`` maximises the score.

    The score is ``zero - MAE`` computed over *all* rows, where rows outside
    ``mask`` keep the values found in ``fitted``. Ties keep the first
    combination met (strict ``>``), iterating exponents, then divisors, then
    offsets. Returns ``((i, j, k), best_score)``; the triple stays ``(0, 0, 0)``
    if no combination beats the initial score of -10000.
    """
    best_score = -10000
    best = (0, 0, 0)
    trial = fitted.copy()
    base = raw[mask]
    for i in exponents:
        powered = base ** i  # invariant over the two inner loops
        for j in divisors:
            scaled = powered / j
            for k in offsets:
                trial[mask] = scaled + k
                new_score = zero - mean_absolute_error(y_true=y_true, y_pred=trial)
                if new_score > best_score:
                    best_score = new_score
                    best = (i, j, k)
    return best, best_score


def transform_pred(pred, classes=None, y_true=None):
    """Tune the per-class rescaling ``pred**i / j + k`` of the regression output.

    Rows predicted as class 0 are set to 0. Classes 1 to 4 are tuned one after
    the other; each search keeps the already-tuned classes at their best
    parameters and the not-yet-tuned ones at their raw prediction.

    Parameters
    ----------
    pred : numpy.ndarray
        Raw regression predictions, one per evaluated row. Not modified.
    classes : array-like, optional
        Predicted class label (0-4) per row. Defaults to the notebook global
        ``X_test_class``.
    y_true : array-like, optional
        True retweet counts for the same rows. Defaults to the notebook global
        ``y_test``.

    Returns
    -------
    tuple
        ``(i1, j1, k1, i2, j2, k2, i3, j3, k3, i4, j4, k4)``.

    Notes
    -----
    NOTE(review): the parameters are selected against ``y_true`` itself, so a
    score reported on the same rows afterwards is optimistic.
    """
    if classes is None:
        classes = X_test_class
    if y_true is None:
        y_true = y_test

    # Baseline: MAE of predicting zero retweets everywhere.
    zero = mean_absolute_error(y_true=y_true, y_pred=pred * 0)

    # (class label, exponent grid, divisor grid, offset grid)
    grids = (
        (1, np.linspace(0.9, 1.5, 20), np.linspace(3, 18, 40), np.linspace(-48, 40, 12)),
        (2, np.linspace(0.9, 1.5, 20), np.linspace(3, 18, 40), np.linspace(0, 200, 12)),
        (3, np.linspace(0.9, 1.5, 20), np.linspace(3, 18, 40), np.linspace(-500, 500, 12)),
        (4, np.linspace(-1.5, 1.5, 20), np.linspace(-72, 3, 40), np.linspace(-4000, 0, 12)),
    )

    fitted = pred.copy()
    fitted[classes == 0] = 0

    params = []
    score = -10000
    for label, exponents, divisors, offsets in grids:
        mask = classes == label
        (i, j, k), score = _fit_class_curve(pred, fitted, mask, y_true, zero,
                                            exponents, divisors, offsets)
        # Freeze this class at its best parameters before tuning the next one.
        fitted[mask] = (pred[mask] ** i) / j + k
        params.extend((i, j, k))

    i1, j1, k1, i2, j2, k2, i3, j3, k3, i4, j4, k4 = params
    print("score :", score)
    print('i1 :', i1, 'j1 :', j1, 'k1 :', k1, '\ni2 :', i2, 'j2 :', j2, 'k2 :', k2,
          '\ni3 :', i3, 'j3 :', j3, 'k3 :', k3, '\ni4 :', i4, 'j4 :', j4, 'k4 :', k4)

    return i1, j1, k1, i2, j2, k2, i3, j3, k3, i4, j4, k4

In [None]:
# Start again from the raw regression output kept in `tmp`.
y_pred = tmp.copy()
# Grid-search the per-class rescaling parameters (timed).
%time i1, j1, k1, i2, j2, k2, i3, j3, k3, i4, j4, k4 = transform_pred(y_pred)
#%time i1, j1, k1, i2, j2, k2, i3, j3, k3 = transform_pred(y_pred)

score : 8.672433659666694
i1 : 1.0263157894736843 j1 : 9.153846153846153 k1 : 8.0 
i2 : 1.2473684210526317 j2 : 17.230769230769234 k2 : 90.90909090909092 
i3 : 1.0578947368421052 j3 : 3.0 k3 : 45.454545454545496 
i4 : 1.026315789473684 j4 : 1.0769230769230802 k4 : -1818.181818181818
CPU times: user 1min 35s, sys: 1.17 s, total: 1min 37s
Wall time: 1min 37s


In [None]:
# Class 0 is predicted as zero retweets; classes 1-4 get their tuned pred**i / j + k rescaling.
y_pred[X_test_class == 0] = 0
tuned_params = ((i1, j1, k1), (i2, j2, k2), (i3, j3, k3), (i4, j4, k4))
for label, (exponent, divisor, offset) in enumerate(tuned_params, start=1):
    in_class = X_test_class == label
    y_pred[in_class] = (tmp[in_class]**exponent) / divisor + offset

In [None]:
# Score = improvement of the model's MAE over the all-zero baseline.
zero = mean_absolute_error(y_true=y_test, y_pred=y_pred*0)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Prediction error on true datas :", mae)
print("Constant zero prediction :", zero)
print('score :', zero - mae)

Prediction error on true datas : 142.79484281804866
Constant zero prediction : 151.46727647771536
score : 8.672433659666694


Score 8.85339735118896 was obtained with: class thresholds 10 / 600 / 3000, upper / lower bounds 20 000 / 1, n_estimators = 200

In [None]:
def write_prediction_file(eval_data, y_pred):
    """Write the predictions to ``random_forests.txt`` as CSV (header: TweetID,NoRetweets).

    Rows are paired by position: the n-th prediction is written next to the
    n-th value of ``eval_data['id']``. Lengths are not checked; extra ids are
    ignored and extra predictions raise ``IndexError``. Each prediction is
    truncated towards zero with ``int()``.

    Parameters
    ----------
    eval_data : pandas.DataFrame
        Frame with an ``id`` column.
    y_pred : iterable of numbers
        Predicted retweet counts, in the same row order as ``eval_data``.
    """
    tweet_ids = eval_data['id']
    # newline='' is required by the csv module; without it platforms that translate
    # newlines would emit "\r\r\n" line endings.
    with open("random_forests.txt", 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        for index, prediction in enumerate(y_pred):
            writer.writerow([str(tweet_ids.iloc[index]), str(int(prediction))])

In [None]:
# NOTE(review): ids and predictions are paired by position, so y_pred must hold one prediction
# per row of eval_ds. With the train/test split active, y_pred was computed on X_test (a slice
# of the training data) — retrain with X_test = Z before writing a real submission.
write_prediction_file(eval_ds, y_pred)