In [15]:
import csv
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from nltk.corpus import stopwords
import nltk 
# Download the NLTK stop-word corpus; stopwords.words('french') is used later when building the TF-IDF vectorizer.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/bruce/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Read the labelled tweets (the features plus the retweets_count target) from disk
train_csv_path = "train.csv"
train_data = pd.read_csv(train_csv_path)

In [11]:
# Show the loaded training frame (the rendered output is abbreviated to the first and last rows)
train_data

Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,TweetID
0,rt refarcir macron ans nom prépare,3,0,3682,453535,3628,[],[],0,[],1646978048000,832509
1,populaire,0,0,86,1016,284,[],[],0,[],1647694288000,1388011
2,faut dégager cinglé,3,1,1944,28234,1995,[],[],0,[],1647370048000,63896
3,enseignants mettre prescriptions président rép...,0,0,1,1072,0,[],['https://t.co/rytlted08g'],0,[],1647256282000,979251
4,mafieuse oppressive macron,0,0,13957,25311,10841,[],[],0,[],1647258374000,1040049
...,...,...,...,...,...,...,...,...,...,...,...,...
353964,gonflette tour raciste frustré,0,0,34,1509,55,[],['https://t.co/pma33zhslx'],0,[],1647438153000,142573
353965,france caste crapuleuse encadrée gangsters irr...,0,0,89,11166,127,[],[],0,[],1647072106000,240866
353966,eric zemmour français berbère,3,0,1888,712,3086,[],[],0,[],1647607230000,1173763
353967,gauchistes dépression pq,0,0,139,486,320,[],[],0,[],1646987195000,929182


In [8]:
# Hold out part of the labelled data as a local test set, so the model can be evaluated
# without uploading to Kaggle and without overfitting to the Kaggle evaluation dataset.
# scsplit splits the regression data in a stratified way, keeping a similar distribution
# of retweet counts in the two sets.
# random_state is fixed so that re-running the notebook produces the same split
# (and therefore a comparable error figure).
X_train, X_test, y_train, y_test = scsplit(
    train_data,
    train_data['retweets_count'],
    stratify=train_data['retweets_count'],
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [9]:
# The retweet count is the value being predicted, so it must not remain among the features
target_column = 'retweets_count'
X_train = X_train.drop(columns=[target_column])
X_test = X_test.drop(columns=[target_column])

In [10]:
# Inspect the training features after the target column has been removed
X_train

Unnamed: 0,text,favorites_count,followers_count,statuses_count,friends_count,mentions,urls,verified,hashtags,timestamp,TweetID
264414,sondage zemmour malhonnêteté honte score candidat,9,3638,7769,807,[],['https://t.co/vwctahcvpy'],0,[],1647530854000,426270
250155,macron déjà pied tombe ans papy résistance,0,279,11310,556,[],['https://t.co/tqzllqffmh'],0,[],1647242508000,1084397
113967,contribue ratio kaeya bichon jvais sortir,0,470,35104,482,[],[],0,[],1647719559000,1373123
83546,npa ps sondage kantar philippe devance anne ca...,0,1260,180544,3944,[],['https://t.co/jbvrnk0srp'],0,"['poutou', 'hidalgo']",1646936415000,598508
330783,restera réponse zemmour violence ouvre bouche,0,2834,53059,2112,[],[],0,[],1647101957000,232704
...,...,...,...,...,...,...,...,...,...,...,...
207527,nooon voulais zemmour raciste mangas jv kurby pp,0,28,1594,2010,[],['https://t.co/dleo8f3jrq'],0,[],1647416744000,132434
272164,melenchon analyse poutine loin indignés salon ...,28,6,920,27,[],['https://t.co/obfccvxics'],0,['facealaguerretf1'],1647288865000,975674
6289,tue démocratie partis,0,787,10486,337,[],[],0,[],1647543503000,471008
251417,rt distraction concilions démocratie société,0,227,4603,454,[],[],0,[],1647072605000,260783


In [16]:
# Turn the tweet text into TF-IDF features, limited to 100 vocabulary terms and with French stop words removed.
# The vocabulary is fitted on the training split only; the same fitted vectorizer then encodes the test split.
french_stop_words = stopwords.words('french')
vectorizer = TfidfVectorizer(stop_words=french_stop_words, max_features=100)
X_train = vectorizer.fit_transform(X_train['text'])
X_test = vectorizer.transform(X_test['text'])

In [17]:
# Train the model: a Gradient Boosting Regressor with scikit-learn's default loss and hyper-parameters.
# random_state is fixed so that repeated runs build the same model.
reg = GradientBoostingRegressor(random_state=42)

# Fit the model using the training split
reg.fit(X_train, y_train)
# Predict the values for the held-out split
y_pred = reg.predict(X_test)
# Retweet counts are non-negative integers: negative predictions become 0,
# all others are truncated (not rounded) by int().
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

# The reported error is the mean absolute error on the held-out split
print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))

Prediction error: 26.3196975261557


In [18]:
###################################
# With the features and model finalized, the model is trained on the whole training set
# and then used to produce predictions for the evaluation dataset
###################################
# Read the evaluation tweets from disk
eval_csv_path = "evaluation.csv"
eval_data = pd.read_csv(eval_csv_path)

In [19]:
# Re-fit a TF-IDF vectorizer (100 terms, French stop words removed) on every labelled tweet,
# and use the full retweets_count column as the target.
french_stop_words = stopwords.words('french')
vectorizer = TfidfVectorizer(stop_words=french_stop_words, max_features=100)
X_train = vectorizer.fit_transform(train_data['text'])
y_train = train_data['retweets_count']

In [20]:
# Fit a fresh Gradient Boosting Regressor on the training data.
# random_state is fixed so that re-running the notebook reproduces the same model
# (and therefore the same submission file).
reg = GradientBoostingRegressor(random_state=42)
reg.fit(X_train, y_train)
# Encode the evaluation tweets with the vocabulary already fitted by the vectorizer
X_val = vectorizer.transform(eval_data['text'])

In [21]:
# Predict the number of retweets for the evaluation dataset, then turn the raw outputs
# into non-negative integers: negatives become 0, everything else is truncated by int().
raw_predictions = reg.predict(X_val)
y_pred = [int(raw_value) if raw_value >= 0 else 0 for raw_value in raw_predictions]

In [23]:
# Dump the results into a file that follows the required Kaggle template:
# a "TweetID,retweets_count" header followed by one row per evaluation tweet.
# newline='' leaves line endings to the csv module; without it, platforms that translate
# "\n" on write (e.g. Windows) produce an extra blank line after every row.
with open("gbr_predictions.txt", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    # y_pred is positionally aligned with eval_data, so pair the two directly
    # rather than doing one .iloc lookup per row.
    for tweet_id, prediction in zip(eval_data['TweetID'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])

In [24]:
# Baseline for comparison: a dummy regressor using the "mean" strategy,
# fitted on the training targets and applied to the evaluation features.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
dummy_pred = dummy_regr.predict(X_val)
# Write the baseline in the same Kaggle template as the model predictions.
# newline='' leaves line endings to the csv module; without it, platforms that translate
# "\n" on write (e.g. Windows) produce an extra blank line after every row.
with open("mean_predictions.txt", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    # dummy_pred is positionally aligned with eval_data, so pair the two directly
    # rather than doing one .iloc lookup per row. int() truncates each prediction.
    for tweet_id, prediction in zip(eval_data['TweetID'], dummy_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])

In [25]:
# Second baseline: a dummy regressor configured to output the constant 0
dummy_regr = DummyRegressor(strategy="constant", constant=0)
dummy_regr.fit(X_train, y_train)
# Baseline predictions for the evaluation features
dummy_pred = dummy_regr.predict(X_val)