In [1]:
import datetime
import re

import numpy as np
import pandas as pd
from dateutil.parser import parse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Read the tweet table from the parquet file next to the notebook and
# preview the first rows.
DATA_PATH = "mm.pq"
df = pd.read_parquet(DATA_PATH)
df.head()

Unnamed: 0,tweet_id,body,favorites,retweets,date_created
0,1095797317389692936,Building on the best elements of our own susta...,6,0,Wed Feb 13 21:30:14 +0000 2019
1,1095774709394874368,Looking for a summer job? There is an upcoming...,3,0,Wed Feb 13 20:00:23 +0000 2019
2,1095770370513227778,RT @MercyhurstHM: Need some sweet treats or g...,0,1,Wed Feb 13 19:43:09 +0000 2019
3,1095744507298136065,Our next #HurstValentines couple is Solveig an...,30,6,Wed Feb 13 18:00:23 +0000 2019
4,1095740403855355906,"@dfortu00 Hi Dominic, we are very sorry to hea...",0,0,Wed Feb 13 17:44:04 +0000 2019


In [3]:
# Layout of the raw timestamp strings shown in the preview above,
# e.g. "Wed Feb 13 21:30:14 +0000 2019".
TWEET_DATE_FORMAT = "%a %b %d %H:%M:%S %z %Y"

# Parse the whole column in one vectorised call instead of running
# dateutil on every row.  utc=True honours the offset in each string, and
# tz_convert(None) then converts to UTC and drops the timezone, leaving a
# naive-UTC datetime64 column (the old replace(tzinfo=None) simply threw the
# offset away without converting).
df["date_created"] = pd.to_datetime(
    df["date_created"], format=TWEET_DATE_FORMAT, utc=True
).dt.tz_convert(None)

In [4]:
# Target: total engagement per tweet.
df["interaction"] = df.favorites + df.retweets

# Calendar features.  The .dt accessors already return integer columns, so
# no extra numeric conversion is needed.
df["hour"] = df.date_created.dt.hour
df["day"] = df.date_created.dt.dayofyear
df["day_of_week"] = df.date_created.dt.dayofweek

# date_created holds naive timestamps taken from "+0000" strings, i.e. UTC.
# The reference instant must be naive UTC as well; a local-time "now" would
# shift every value by the machine's UTC offset.
now_utc = pd.Timestamp.now(tz="UTC").tz_localize(None)

# Age of each tweet in hours (raw seconds are needlessly large).
df["time_since"] = (now_utc - df.date_created).dt.total_seconds() / 3600
df.head()

Unnamed: 0,tweet_id,body,favorites,retweets,date_created,interaction,hour,day,day_of_week,time_since
0,1095797317389692936,Building on the best elements of our own susta...,6,0,2019-02-13 21:30:14,6,21,44,2,137.956161
1,1095774709394874368,Looking for a summer job? There is an upcoming...,3,0,2019-02-13 20:00:23,3,20,44,2,139.453661
2,1095770370513227778,RT @MercyhurstHM: Need some sweet treats or g...,0,1,2019-02-13 19:43:09,1,19,44,2,139.740883
3,1095744507298136065,Our next #HurstValentines couple is Solveig an...,30,6,2019-02-13 18:00:23,36,18,44,2,141.453661
4,1095740403855355906,"@dfortu00 Hi Dominic, we are very sorry to hea...",0,0,2019-02-13 17:44:04,0,17,44,2,141.725605


In [5]:
# Remove the tweet id and the two raw counters (already summed into
# `interaction`), then use the creation timestamp as the row index.
df = (
    df.drop(columns=["tweet_id", "favorites", "retweets"])
      .set_index("date_created")
)
df.head()

Unnamed: 0_level_0,body,interaction,hour,day,day_of_week,time_since
date_created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-02-13 21:30:14,Building on the best elements of our own susta...,6,21,44,2,137.956161
2019-02-13 20:00:23,Looking for a summer job? There is an upcoming...,3,20,44,2,139.453661
2019-02-13 19:43:09,RT @MercyhurstHM: Need some sweet treats or g...,1,19,44,2,139.740883
2019-02-13 18:00:23,Our next #HurstValentines couple is Solveig an...,36,18,44,2,141.453661
2019-02-13 17:44:04,"@dfortu00 Hi Dominic, we are very sorry to hea...",0,17,44,2,141.725605


In [6]:
# Twitter noise: @mentions and links.  \S (rather than [^ ]) ends a match at
# ANY whitespace, so a link followed by a line break no longer swallows the
# first word of the next line.
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://\S+'
combined_pat = r'|'.join((pat1,pat2))
# The dot is escaped so only a literal "www." prefix is treated as a link;
# unescaped it matched any character and deleted words such as "awwwesome".
www_pat = r'www\.\S+'
# Contractions expanded to their two-word form so the "not" survives the
# punctuation stripping below.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def pre_processing(row):
    """Clean one tweet's text for vectorisation.

    Steps, in order: remove @mentions and http(s) links, remove "www." links,
    lower-case, expand the contractions in ``negations_dic``, turn every
    whitespace character into a plain space, drop everything that is not an
    ASCII letter or a space, and strip leading/trailing spaces.

    Parameters
    ----------
    row : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.  Runs of spaces left behind by removed tokens are
        not collapsed.
    """
    without_handles_links = re.sub(combined_pat, '', row)
    without_www = re.sub(www_pat, '', without_handles_links)
    lowered = without_www.lower()
    expanded = neg_pattern.sub(lambda match: negations_dic[match.group()], lowered)
    # Newlines and tabs must become spaces BEFORE the letter filter, otherwise
    # the filter deletes them and glues the neighbouring words together.
    spaced = re.sub(r'\s', ' ', expanded)
    letters_only = re.sub(r'[^A-Za-z ]', '', spaced)
    return letters_only.strip()

In [7]:
# Replace each tweet's raw text with its cleaned form.
df["body"] = df["body"].map(pre_processing)
df.head()

Unnamed: 0_level_0,body,interaction,hour,day,day_of_week,time_since
date_created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-02-13 21:30:14,building on the best elements of our own susta...,6,21,44,2,137.956161
2019-02-13 20:00:23,looking for a summer job there is an upcoming ...,3,20,44,2,139.453661
2019-02-13 19:43:09,rt need some sweet treats or gift for your h...,1,19,44,2,139.740883
2019-02-13 18:00:23,our next hurstvalentines couple is solveig and...,36,18,44,2,141.453661
2019-02-13 17:44:04,hi dominic we are very sorry to hear about thi...,0,17,44,2,141.725605


In [8]:
# TF-IDF over word uni-, bi- and tri-grams, with English stop words removed.
tfidf_options = {
    "analyzer": "word",
    "ngram_range": (1, 3),
    "stop_words": "english",
}
word_grams = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the cleaned tweets and vectorise them in one pass.
word_vector = word_grams.fit_transform(df["body"])

# Placeholder for the TF-IDF columns built in the following cell.
word_df = pd.DataFrame()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(word_grams, "get_feature_names_out"):
    feature_names = word_grams.get_feature_names_out()
else:
    feature_names = word_grams.get_feature_names()

# Build the dense frame in a single step.  Adding one column per n-gram in a
# Python loop re-slices the sparse matrix for every feature and fragments
# the DataFrame; this yields the same columns, order and values at once.
word_df = pd.DataFrame(word_vector.toarray(), columns=feature_names)

In [None]:
# word_df rows are in the same order as df rows (the vectoriser was fitted on
# df.body), but word_df carries a default 0..n-1 index while df is indexed by
# date_created, so an index-on-index merge cannot line the rows up.  Align
# them by position instead.
# The prefix keeps n-gram names such as "day", "body" or "interaction" from
# clashing with the existing columns of the same name.
tfidf_features = word_df.add_prefix("tfidf_")
tfidf_features.index = df.index  # raises if the row counts differ
df = pd.concat([df, tfidf_features], axis=1)

In [None]:
# The text column is no longer needed now that its TF-IDF features are part
# of df; drop it and release the standalone feature frame.
df = df.drop(columns=["body"])
del word_df

df.head()

In [None]:
# Features are every column except the target; the target is the engagement
# count as a plain NumPy array.
x_data = df.drop(columns=["interaction"])
y_data = df["interaction"].values

# Fixed seed so the 90/10 split (and every number derived from it) is the
# same on each Restart & Run All.
SPLIT_SEED = 42

# NOTE(review): the TF-IDF vectoriser was fitted on the full corpus before
# this split, so the held-out rows influenced the vocabulary and IDF weights.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.10, random_state=SPLIT_SEED
)