In [54]:
# Libraries for data loading, data manipulation and data visulisation
import numpy as np
import pandas as pd
#
import string
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
from statsmodels.graphics.correlation import plot_corr
from scipy.stats import skew
from scipy.stats import kurtosis
import statistics

# datetime
import datetime

# Libraries for data preparation and model building
from sklearn.pipeline import Pipeline
import statsmodels.formula.api as sm
from statsmodels.formula.api import ols
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import boxcox, zscore
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures

# saving my model
import pickle

#ignoring warnings
import warnings
warnings.filterwarnings('ignore')


In [55]:
#making sure that we can see all rows and cols
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [56]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test_with_no_labels.csv')

In [57]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df_train['message'] = df_train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [58]:
def lower_case(df, column_name):
    
    df[column_name] = df[column_name].str.lower()
    
    return df

In [59]:
lower_case(df_train, 'message').head()

Unnamed: 0,sentiment,message,tweetid
0,1,"polyscimajor epa chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? url-web via @mashable",625221
1,1,it's not like we lack evidence of anthropogenic global warming,126103
2,2,rt @rawstory: researchers say we have three years to act on climate change before it’s too late url-web url-web…,698562
3,1,#todayinmaker# wired : 2016 was a pivotal year in the war on climate change url-web,573736
4,1,"rt @soynoviodetodas: it's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #electionnight",466954


In [96]:
def lookup_dict(text, dictionary):
    
    for word in text.split():
        
        if word in dictionary:
            
            if word in text.split():
                
                text = text.replace(word, dictionary[word])
                
    return text

In [101]:
contractions = {"doesn't" : 'does not','wouldn\'t': 'would not','it\'s' : 'it is',
               'i\'m': 'i am', 'we\'re' : 'we are','i\'ve':'i have','let\'s' : 'let us',
               'couldn\'t' : 'could not','don\'t' : 'do not', 'lol' : 'laugh out loud',
               'ftl': 'for the loss', 'fwiw': 'for what it is worth', 'imo' : 'in my opinion',
               'diaf': 'die in a fire','dm': 'direct message', 'afaik':'as far as i know',
               'imho': 'in my humble opinion', 'tbh': 'to be honest','icymi': 'in case you missed it',
               'idk': 'i do not know', 'mt': 'modified tweet', 'smh':'shaking my head',
               'smdh':'shaking my damn head','nts':'note to self','ifykyk':'if you know, you know',
               'ijs':'i am just saying', 'tbqh':'to be quite honest','fyi':'for your information',
               'idc':'i do not care','hth':'happy to help', 'hth':'hear to help','hifw':'how i feel when',
               'we\'ve':'we have','i\'d':'i would', 'i\'ll':'i will'}

In [102]:
df_train['clean_message'] = df_train['message'].apply(lambda x: lookup_dict(x, contractions))

In [103]:
df_train.head()

Unnamed: 0,sentiment,message,tweetid,clean_message
0,1,polyscimajor epa chief doesnt think carbon dioxide is main cause of global warming and wait what urlweb via mashable,625221,polyscimajor epa chief doesnt think carbon dioxide is main cause of global warming and wait what urlweb via mashable
1,1,its not like we lack evidence of anthropogenic global warming,126103,its not like we lack evidence of anthropogenic global warming
2,2,rt rawstory researchers say we have three years to act on climate change before it’s too late urlweb urlweb…,698562,rt rawstory researchers say we have three years to act on climate change before it’s too late urlweb urlweb…
3,1,todayinmaker wired 2016 was a pivotal year in the war on climate change urlweb,573736,todayinmaker wired 2016 was a pivotal year in the war on climate change urlweb
4,1,rt soynoviodetodas its 2016 and a racist sexist climate change denying bigot is leading in the polls electionnight,466954,rt soynoviodetodas its 2016 and a racist sexist climate change denying bigot is leading in the polls electionnight


In [60]:
def remove_punctuation(message):
    return ''.join([l for l in message if l not in string.punctuation])

In [61]:
df_train['message'] = df_train['message'].apply(remove_punctuation)

In [62]:
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dioxide is main cause of global warming and wait what urlweb via mashable,625221
1,1,its not like we lack evidence of anthropogenic global warming,126103
2,2,rt rawstory researchers say we have three years to act on climate change before it’s too late urlweb urlweb…,698562
3,1,todayinmaker wired 2016 was a pivotal year in the war on climate change urlweb,573736
4,1,rt soynoviodetodas its 2016 and a racist sexist climate change denying bigot is leading in the polls electionnight,466954


In [84]:
sent_list = []

message = []

for count in range(0,len(df_train)):
    
    if df_train['sentiment'][count] == 1:
        
        sent_list.append(1)
        message.append(df_train['message'][count])
        
    elif df_train['sentiment'].iloc[count] == -1:
                   
        sent_list.append(1)
        message.append(df_train['message'][count])

In [87]:
dict_ = {'sentiment': sent_list,
        'message' : message}

In [88]:
data = pd.DataFrame(dict_)

Unnamed: 0,sentiment,message
9821,1,rt stephenschlegel shes thinking about how shes going to die because your husband doesnt believe in climate change urlwebã¢â‚¬â¦
9822,1,rt sierraclub 2016 hottest year in history also in 2016 182 members of congress denied climate change is real urlweb…
9823,1,rt thinkprogress epa head falsely claims carbon emissions aren’t the cause of global warming\nurlweb urlweb…
9824,1,rt ezlusztig they took down the material on global warming lgbt rights and health care but now theyre hocking melanias qvc https…
9825,1,rt sara8smiles hey liberals the climate change crap is a hoax that ties to agenda2030\nthe climate is being changed byã¢â‚¬â¦


In [63]:
df = data.message.values

y = data.sentiment.values

In [64]:
x_train, x_test, y_train, y_test = train_test_split(df,
                                                    y,
                                                   stratify = y, 
                                                   random_state = 1,
                                                   test_size = 0.2,
                                                   shuffle = True)

In [91]:
vectorizer = CountVectorizer(binary = True,stop_words = 'english')

vectorizer.fit(list(x_train) + list(x_test))

CountVectorizer(binary=True, stop_words='english')

In [92]:
X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)

In [93]:
svm = SVC(kernel = 'linear', probability = True)

prob = svm.fit(X_train,y_train).predict_proba(X_test)

y_pred = svm.predict(X_test)

In [94]:
kwanda = f1_score(y_test, y_pred, average='macro')

In [95]:
kwanda

0.6682502823227956