In [24]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
import plotly.tools as tls
import seaborn as sns
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split

In [25]:
df = pd.read_csv("/home/ivelasquez/Full Dataset  - Even & Cleaned.csv")
df.head()

Unnamed: 0,name,message,Virality
0,Photos from CNN's post,At least 22 hours after the earthquake struck ...,Viral
1,Timeline Photos,Peace among protest: A Portland police officer...,Viral
2,WWII Vet Reunites With Man He Saved From Conce...,EMOTIONAL REUNION: WWII veteran reunited with ...,Viral
3,Timeline Photos,A teenager and his mom were driving home when ...,Viral
4,Opposing protesters meet in Dallas,What happened when a Black Lives Matter protes...,Viral


In [26]:
df.name.fillna('')
df.message.fillna('')
#df.description.fillna('')
df["name"] = df["name"].astype(str)
df["message"] = df["message"].astype(str)
#df["description"] = df["description"].astype(str)
df.head()

Unnamed: 0,name,message,Virality
0,Photos from CNN's post,At least 22 hours after the earthquake struck ...,Viral
1,Timeline Photos,Peace among protest: A Portland police officer...,Viral
2,WWII Vet Reunites With Man He Saved From Conce...,EMOTIONAL REUNION: WWII veteran reunited with ...,Viral
3,Timeline Photos,A teenager and his mom were driving home when ...,Viral
4,Opposing protesters meet in Dallas,What happened when a Black Lives Matter protes...,Viral


In [27]:
#Text Processing - Lower casing
df["name"] = df["name"].str.lower()
df["message"] = df["message"].str.lower()
#df["description"] = df["description"].str.lower()
df.head()

Unnamed: 0,name,message,Virality
0,photos from cnn's post,at least 22 hours after the earthquake struck ...,Viral
1,timeline photos,peace among protest: a portland police officer...,Viral
2,wwii vet reunites with man he saved from conce...,emotional reunion: wwii veteran reunited with ...,Viral
3,timeline photos,a teenager and his mom were driving home when ...,Viral
4,opposing protesters meet in dallas,what happened when a black lives matter protes...,Viral


In [28]:
#Remove Frequent Words
from collections import Counter
cnt = Counter()
for text in df["name"].values:
    for word in text.split():
        cnt[word] += 1
for text in df["message"].values:
    for word in text.split():
        cnt[word] += 1
#for text in df["description"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

[('the', 113442),
 ('to', 63475),
 ('a', 60896),
 ('of', 55531),
 ('in', 53620),
 ('and', 40389),
 ('for', 25705),
 ('on', 23108),
 ('is', 19671),
 ('that', 16569)]

In [29]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["name"] = df["name"].apply(lambda text: remove_freqwords(text))
df["message"] = df["message"].apply(lambda text: remove_freqwords(text))
#df["description"] = df["description"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,name,message,Virality
0,photos from cnn's post,at least 22 hours after earthquake struck nepa...,Viral
1,timeline photos,peace among protest: portland police officer n...,Viral
2,wwii vet reunites with man he saved from conce...,emotional reunion: wwii veteran reunited with ...,Viral
3,timeline photos,teenager his mom were driving home when he sud...,Viral
4,opposing protesters meet dallas,what happened when black lives matter protest ...,Viral


In [30]:
#Remove Punctuation
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    

In [31]:
df["name"] = df["name"].apply(lambda text: remove_punctuation(text))
df["message"] = df["message"].apply(lambda text: remove_punctuation(text))
#df["description"] = df["description"].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,name,message,Virality
0,photos from cnns post,at least 22 hours after earthquake struck nepa...,Viral
1,timeline photos,peace among protest portland police officer no...,Viral
2,wwii vet reunites with man he saved from conce...,emotional reunion wwii veteran reunited with h...,Viral
3,timeline photos,teenager his mom were driving home when he sud...,Viral
4,opposing protesters meet dallas,what happened when black lives matter protest ...,Viral


In [32]:
#Remove rare words
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["name"] = df["name"].apply(lambda text: remove_rarewords(text))
df["message"] = df["message"].apply(lambda text: remove_rarewords(text))

#df["description"] = df["description"].apply(lambda text: remove_rarewords(text))
df.head()



Unnamed: 0,name,message,Virality
0,photos from cnns post,at least 22 hours after earthquake struck nepa...,Viral
1,timeline photos,peace among protest portland police officer no...,Viral
2,wwii vet reunites with man he saved from conce...,emotional reunion wwii veteran reunited with h...,Viral
3,timeline photos,teenager his mom were driving home when he sud...,Viral
4,opposing protesters meet dallas,what happened when black lives matter protest ...,Viral


In [33]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df["name"] = df["name"].apply(lambda text: remove_urls(text))
df["message"] = df["message"].apply(lambda text: remove_urls(text))
#df["description"] = df["description"].apply(lambda text: remove_urls(text))
df.head()

Unnamed: 0,name,message,Virality
0,photos from cnns post,at least 22 hours after earthquake struck nepa...,Viral
1,timeline photos,peace among protest portland police officer no...,Viral
2,wwii vet reunites with man he saved from conce...,emotional reunion wwii veteran reunited with h...,Viral
3,timeline photos,teenager his mom were driving home when he sud...,Viral
4,opposing protesters meet dallas,what happened when black lives matter protest ...,Viral


In [34]:
positive = df[df['Virality'] == 'Viral']
negative = df[df['Virality'] == 'Not Viral']

In [35]:
fig = px.histogram(df, x="Virality")
fig.update_traces(marker_color="indianred",marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Virality')
fig.show()

In [36]:
# random split train and test data
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [37]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
#train.description.fillna('')
#test.description.fillna('')
train.name.fillna('')
test.name.fillna('')
train.message.fillna('')
test.message.fillna('')
train.Virality.fillna('')
test.Virality.fillna('')

3            Viral
9            Viral
11           Viral
18           Viral
28           Viral
           ...    
50340    Not Viral
50341    Not Viral
50344    Not Viral
50353    Not Viral
50354    Not Viral
Name: Virality, Length: 10641, dtype: object

In [38]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['message'].values.astype('U'))
test_matrix = vectorizer.transform(test['message'].values.astype('U'))

In [39]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [40]:
X_train = train_matrix
X_test = test_matrix
y_train = train['Virality']
y_test = test['Virality']

In [41]:
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
predictions = lr.predict(X_test)

In [43]:
# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predictions,y_test)

array([[3939, 1338],
       [1307, 4057]])

In [44]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

   Not Viral       0.75      0.75      0.75      5277
       Viral       0.75      0.76      0.75      5364

    accuracy                           0.75     10641
   macro avg       0.75      0.75      0.75     10641
weighted avg       0.75      0.75      0.75     10641

