In [None]:
import torch
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, Flatten, BatchNormalization, LeakyReLU,concatenate
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor



# Google Colab only: mount Google Drive so the dataset files can be read.
from google.colab import drive
drive.mount('/content/drive')
# End of the Colab-only section.

# Use the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Load the training data.
# Training the deep learning model on the whole dataset can take a long time;
# consider working on a subset of `data` if that is a problem.
train_csv_path = "/content/drive/MyDrive/DATA_ML/train.csv"
data = pd.read_csv(train_csv_path)


  

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Evaluation set, plus one feature view per model family.
eval_data = pd.read_csv("/content/drive/MyDrive/DATA_ML/evaluation.csv")

# View fed to the scikit-learn pipeline: everything except 'urls' and 'TweetID'.
X_off_pipeline = eval_data.drop(columns=['urls', 'TweetID'], errors='ignore')

# View fed to the neural network: additionally without 'hashtags' and 'mentions'.
X_off_Neural_Net = eval_data.drop(
    columns=['urls', 'hashtags', 'mentions', 'TweetID'], errors='ignore'
)

# View fed to XGBoost: the text-like columns are removed, 'TweetID' is kept.
X_off_XGB = eval_data.drop(columns=['text', 'urls', 'mentions', 'hashtags'])

#----------------------------------------------------------------------Function to submit prediction-------------------------------------------------

def submit(nameFile, y_pred, tweet_ids=None):
  """Write predictions to a two-column CSV file (TweetID, retweets_count).

  Args:
    nameFile: Path of the file to create (overwritten if it exists).
    y_pred: Predictions, one per tweet. May be 1-D or a column-shaped 2-D
      array (e.g. the (n, 1) output of a Keras model); it is flattened.
    tweet_ids: Optional identifiers matching `y_pred` by position. Defaults
      to the 'TweetID' column of the global `eval_data` frame.

  Raises:
    IndexError: If there are more predictions than tweet identifiers.

  Each prediction is truncated to an integer with `int()` before writing.
  """
  if tweet_ids is None:
    tweet_ids = eval_data['TweetID']
  ids = list(tweet_ids)
  # Flatten so that each item is a scalar: int() on a 1-element array with
  # ndim > 0 is deprecated in recent NumPy versions.
  flat_predictions = np.asarray(y_pred).ravel()
  if len(flat_predictions) > len(ids):
    # Fail before touching the file instead of leaving a half-written one.
    raise IndexError("more predictions (%d) than tweet ids (%d)"
                     % (len(flat_predictions), len(ids)))
  # newline='' is what the csv module expects; without it some platforms
  # insert an extra blank line after every row.
  with open(nameFile, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    for index, prediction in enumerate(flat_predictions):
        writer.writerow([str(ids[index]), str(int(prediction))])











In [None]:
#-----------------------------------------------------------------------Pipeline Model---------------------------------------------------------




# Transformers: numeric columns get median imputation followed by
# standardisation; categorical columns get a constant 'missing' fill followed
# by one-hot encoding that ignores categories not seen during fit.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# The whole frame (target column included) is used as X; the feature lists
# below decide which columns the transformers actually receive.
# NOTE(review): relies on ColumnTransformer's default handling of unlisted
# columns to keep 'retweets_count' away from the estimator - confirm.
X = data
y = data[['retweets_count']]

# Hold out 25% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# Feature selection by dtype: numeric columns minus the id and the target,
# object columns minus 'urls'.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numeric_features = numeric_frame.drop(columns=['TweetID', 'retweets_count']).columns
object_frame = data.select_dtypes(include=['object'])
categorical_features = object_frame.drop(columns=['urls']).columns

# Route each feature group through its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hyper-parameters of the gradient-boosting regressor.
params = dict(
    n_estimators=850,
    learning_rate=0.05,
    loss="absolute_error",
)

# Can be swapped for e.g. RandomForestRegressor(), KNeighborsRegressor() or
# DecisionTreeRegressor().
classifiers = [GradientBoostingRegressor(**params)]

# Full model: preprocessing followed by the first estimator in the list.
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifiers[0]),
])

# Fit on the training split (target flattened to 1-D), then score the
# held-out split with the mean absolute error.
rf.fit(X_train, y_train.to_numpy().ravel())
y_pred = rf.predict(X_test)

print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))








In [None]:
#-----------------------------------------------------------------------------Evaluate with pipeline---------------------------------------------------
# Predict the evaluation set with the fitted pipeline and write the submission file.
pipeline_submission_path = "/content/drive/MyDrive/DATA_ML/Prediction-Pipeline-GradientBooster.txt"
y_pred_pipeline = rf.predict(X_off_pipeline)
submit(pipeline_submission_path, y_pred_pipeline)

In [None]:
#----------------------------------------------------------------------------Function to easily get data-------------------------------------------


def getXY_tot(df):
  """Split a tweet frame into features and target.

  Returns a tuple `(X, y)`: `X` is `df` without the columns 'urls',
  'hashtags', 'mentions', 'retweets_count' and 'TweetID' (columns that are
  absent are simply skipped); `y` is a one-column frame holding
  'retweets_count'.
  """
  excluded = ['urls', 'hashtags', 'mentions', 'retweets_count', 'TweetID']
  features = df.drop(columns=excluded, errors='ignore')
  target = df[['retweets_count']]
  return features, target

def normalize(P, means=None, stds=None):
  """Standardise every column of `P` to (value - mean) / std.

  Args:
    P: DataFrame of numeric columns.
    means: Optional per-column means (Series indexed by column name, or a
      list in column order). Defaults to the column means of `P`.
    stds: Optional per-column standard deviations, same format as `means`.
      Defaults to the column standard deviations of `P` (pandas default,
      i.e. the sample standard deviation).

  Returns:
    A DataFrame with the same shape and labels as `P`.

  Pass the statistics of the training split as `means` / `stds` when
  transforming validation or evaluation data so every split is scaled the
  same way. A standard deviation of exactly 0 (constant column) is treated
  as 1, so such a column becomes 0 instead of NaN.
  """
  means = P.mean() if means is None else pd.Series(means, index=P.columns)
  stds = P.std() if stds is None else pd.Series(stds, index=P.columns)
  # Avoid 0/0 -> NaN for constant columns.
  stds = stds.where(stds != 0, 1.0)

  return (P - means) / stds

In [None]:
#--------------------------------------------------------------------------Splitting the data into textual and numerical data---------------------------------

# Build features/target, hold out 25% of the rows, then separate the raw
# text column from the remaining (numeric) feature columns in each split.
X, y = getXY_tot(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

X_train_t = X_train[['text']]
X_train_n = X_train.drop(columns=['text'])

X_test_t = X_test[['text']]
X_test_n = X_test.drop(columns=['text'])

In [None]:
#--------------------------------------------------------------------------Creating our Embedding Layer for the CNN------------------------------------------------

# Build the word index from the 'text' column of the full feature frame X
# (i.e. the rows of both the train and the test split).
tokenizer = Tokenizer()
tmp = X['text']
tokenizer.fit_on_texts(tmp)
def embedding_for_vocab(filepath, word_index,embedding_dim):
    """Build an embedding matrix for a vocabulary from a text embedding file.

    Args:
        filepath: UTF-8 text file with one entry per line: a word followed
            by its whitespace-separated vector components.
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of leading vector components to keep.

    Returns:
        A `(len(word_index) + 1, embedding_dim)` float array. Rows of words
        that do not occur in the file stay all-zero.

    Raises:
        ValueError: If a vocabulary word's vector in the file has fewer than
            `embedding_dim` components.
    """
    # One extra row because index 0 is reserved.
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size,embedding_dim))
    with open(filepath, encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue
            if len(vector) < embedding_dim:
                raise ValueError(
                    "vector for %r has %d components, expected at least %d"
                    % (word, len(vector), embedding_dim))
            idx = word_index[word]
            embedding_matrix_vocab[idx] = np.asarray(
                vector[:embedding_dim], dtype=np.float32)
    return embedding_matrix_vocab


def getEmbeddingLayer(embedding_dim, maxLen,tokenizer,
                      glove_path='/content/drive/MyDrive/DATA_ML/glove.6B/glove.6B.50d.txt'):
  """Create a frozen Embedding layer initialised from pre-trained vectors.

  Args:
    embedding_dim: Number of vector components kept per word.
    maxLen: Sequence length passed to the layer as `input_length`.
    tokenizer: Fitted tokenizer; its `word_index` defines the vocabulary.
    glove_path: Embedding text file to read. Defaults to the 50-dimensional
      GloVe file on the mounted Drive that was previously hard-coded.

  Returns:
    An `Embedding` layer with `trainable=False` whose weights are the matrix
    produced by `embedding_for_vocab`.
  """
  # The matrix returned by embedding_for_vocab already has one row per word
  # index (plus row 0), so it is used directly as the layer weights; copying
  # it row by row into a second matrix yields the same values.
  emb_matrix = embedding_for_vocab(glove_path, tokenizer.word_index, embedding_dim)
  vocab_len, embed_vector_len = emb_matrix.shape
  embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)
  return embedding_layer

# Padded sequence length and number of embedding components per word.
maxLen, embedding_dim = 80, 50
embedding_layer = getEmbeddingLayer(
    embedding_dim=embedding_dim, maxLen=maxLen, tokenizer=tokenizer
)


In [None]:
#---------------------------------------------------------------------------Concatenated Model-----------------------------------------------------------



def concatModel(in_shape_t, in_shape_n, hidde_dim, out_shape):
  """Build the two-branch model that combines text and numeric inputs.

  Args:
    in_shape_t: Input shape of the text branch (token-index sequences).
    in_shape_n: Input shape of the numeric branch.
    hidde_dim: Width of each Dense layer in the numeric branch.
    out_shape: Number of units of the text-branch head and of the final
      output layer.

  Returns:
    An uncompiled Keras `Model` taking `[text_input, numeric_input]`.

  The text branch uses the module-level `embedding_layer`.
  """
  # Text branch: embedding -> 4 x (Conv1D + BatchNorm) -> max-pool -> flatten
  # -> 3 x (Dense + BatchNorm) -> dropout -> ReLU head.
  x_in_t = Input(shape=in_shape_t)
  text_features = embedding_layer(x_in_t)
  for _ in range(4):
    text_features = Conv1D(filters=64, kernel_size=2, padding='same', activation='relu')(text_features)
    text_features = BatchNormalization()(text_features)
  text_features = MaxPooling1D(pool_size=2)(text_features)
  text_features = Flatten()(text_features)
  for _ in range(3):
    text_features = Dense(64, activation = 'relu')(text_features)
    text_features = BatchNormalization()(text_features)
  text_features = Dropout(0.5)(text_features)
  x_out = Dense(out_shape, activation = 'relu')(text_features)

  # Numeric branch: 3 x (Dense + LeakyReLU).
  x_in_n = Input(shape=in_shape_n)
  numeric_features_out = x_in_n
  for _ in range(3):
    numeric_features_out = Dense(hidde_dim)(numeric_features_out)
    numeric_features_out = LeakyReLU(alpha=0.5)(numeric_features_out)
  xx_out = numeric_features_out

  # Join both branches and map them to the final prediction.
  merged = concatenate([x_out, xx_out], name="concatenated_layer")
  output_layer = Dense(out_shape, activation = "relu", name = "output_layer")(merged)

  return Model(inputs = [x_in_t, x_in_n], outputs = output_layer)





In [None]:
#----------------------------------------------------------------------Training the Model----------------------------------------------


# Turn the training texts into fixed-length, post-padded index sequences.
tmp = X_train_t['text']
X_train_indices = tokenizer.texts_to_sequences(tmp)
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

# Standardise the numeric features. The held-out split is scaled with the
# TRAINING mean/std: scaling it with its own statistics would feed the model
# inputs on a different scale than the one it was fitted on.
X_train_n_norm = normalize(X_train_n)
train_means = X_train_n.mean()
train_stds = X_train_n.std().replace(0, 1)  # constant column: avoid dividing by 0
X_test_n_norm = (X_test_n - train_means) / train_stds

adam = tf.keras.optimizers.Adam(learning_rate = 0.01)
model_concat = concatModel((maxLen,), (X_train_n_norm.shape[1],), 256, y_train.shape[1])
# This is a regression task, so classification accuracy is not informative;
# track the mean squared error alongside the MAE loss instead.
model_concat.compile(optimizer=adam, loss='mean_absolute_error', metrics=['mean_squared_error'])
model_concat.fit([X_train_indices, X_train_n_norm], y_train, batch_size = 1024, epochs = 100)

In [None]:
#--------------------------------------------------------------------------Testing model-------------------------------------------------------------

# Encode the held-out texts the same way as the training texts.
tmp = X_test_t['text']
X_test_indices = tokenizer.texts_to_sequences(tmp)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

pred_n = model_concat.predict([X_test_indices,X_test_n_norm])
# Flatten the predictions before converting to Python ints (int() on a
# 1-element array with ndim > 0 is deprecated in recent NumPy versions);
# values that are not >= 0 are reported as 0.
pred_n = [int(value) if value >= 0 else 0 for value in np.ravel(pred_n)]
print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=pred_n))

In [None]:
#--------------------------------------------------------------------------Evaluate with Neural Network----------------------------------------------

# Separate the text column from the numeric columns of the evaluation set.
X_off_Neural_Net_t = X_off_Neural_Net.loc[:,['text']]
X_off_Neural_Net_n = X_off_Neural_Net.loc[:, ~X_off_Neural_Net.columns.isin(['text'])]
tmp_t = X_off_Neural_Net_t['text']
X_off_indices = tokenizer.texts_to_sequences(tmp_t)
X_off_indices = pad_sequences(X_off_indices, maxlen=maxLen, padding='post')

# The network was fitted on standardised numeric features, so the evaluation
# features must be scaled with the same training-split mean/std before
# predicting; feeding raw values puts them on a different scale.
# NOTE(review): assumes the evaluation frame has the same numeric columns as
# X_train_n - confirm.
off_means = X_train_n.mean()
off_stds = X_train_n.std().replace(0, 1)  # constant column: avoid dividing by 0
X_off_Neural_Net_n_norm = (X_off_Neural_Net_n - off_means) / off_stds

# Flatten the model output so submit() receives one scalar per tweet.
y_pred_Neural_Net = model_concat.predict([X_off_indices, X_off_Neural_Net_n_norm]).ravel()
submit("/content/drive/MyDrive/DATA_ML/Prediction-Neural-Net.txt", y_pred_Neural_Net)



In [None]:
#-------------------------------------------------------------------------XGBooster----------------------------------------------------------
import pickle
import xgboost as xgb
from os.path import exists
from xgboost import XGBRegressor


def train():
    """Fit an XGBoost regressor on the global `data` frame and report its MAE.

    The columns 'text', 'retweets_count', 'urls', 'mentions' and 'hashtags'
    are removed from the features, 'retweets_count' is the target, and 25% of
    the rows are held out. The per-iteration MAE on both splits is plotted
    and the final held-out MAE is printed.

    Returns:
        The fitted `XGBRegressor`.
    """
    X = data.drop(['text', 'retweets_count', 'urls', 'mentions', 'hashtags'], axis=1)
    Y = data['retweets_count']
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)
    # validation_0 = training split, validation_1 = held-out split.
    evalset = [(X_train, Y_train), (X_test,Y_test)]
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear'
    # objective, and eval_metric is given to the constructor because passing
    # it to fit() is deprecated/removed in recent XGBoost releases.
    regressor = XGBRegressor(objective = 'reg:squarederror', eval_metric = 'mae',
                             learning_rate=0.1, max_depth = 3, n_estimators = 360)
    regressor.fit(X_train, Y_train, eval_set=evalset)
    predictions = regressor.predict(X_test)
    results = regressor.evals_result()
    plt.plot(results['validation_0']['mae'], label='train')
    plt.plot(results['validation_1']['mae'], label='test')
    plt.xlabel('boosting round')
    plt.ylabel('mae')
    plt.legend()
    plt.show()
    error = mean_absolute_error(y_true=Y_test, y_pred=predictions)
    print('Prediction error:', error)
    return regressor


def test(regressor, X):
    predictions = regressor.predict(X)
    return predictions




In [None]:
# Fit the XGBoost model, predict the evaluation set and write the submission file.
xgb_submission_path = "/content/drive/MyDrive/DATA_ML/Prediction-XGB.txt"
regressor = train()
y_pred_XGB = test(regressor, X_off_XGB)
submit(xgb_submission_path, y_pred_XGB)