In [1]:
# Analyzing Political Tweets on a Depression Prediction ML Model
### Sam Spell, James Tipton

#Political rhetoric and discussion have seemingly become more polarized in recent years. Being able to vote and take part in politics is an important part of reaching adulthood, and it has historically been central to a stable and healthy society. This project aims to use machine learning to develop a model that predicts depression from a string of text taken from Twitter. Once this model is developed, it can be used to analyze political messages posted online. We will be able to draw out patterns in Twitter texts that the machine learning model classifies as showing signs of depression. Another goal of this machine learning model is to extract patterns of text that can be connected to patterns of political messaging, if they exist, and to examine how those patterns change over time. With views on polarized politics changing, it will be interesting to test whether the prevalence of messages classified as “depression” changes across different political periods.


#### Step 1: Clean the datasets to prepare for the model
#Import libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import numpy as np
from numpy import savetxt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [8]:

#run these downloads once

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

#filter for stopwords

# isolate text column of dataset
# d: labelled depression corpus; p: IMDB review corpus.
d = pd.read_csv("depression.csv")
p = pd.read_csv("IMDB_Dataset.csv")
#comb = p["Title"].fillna('') +  ' ' + p['Text'].fillna('')
#comb2 = comb

# Free-text column of each frame. review_origin keeps a reference to the
# untouched review column so it can be compared with the processed one later.
text = d["clean_text"]
review_origin = review = p['review']

# English stop words, held in a set for membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

# define function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` with stop words dropped.

    The string is split with ``nltk.word_tokenize``; a token is discarded when
    its lower-cased form is present in the module-level ``stop_words`` set.
    The surviving tokens are re-joined with single spaces.
    """
    kept = (tok for tok in nltk.word_tokenize(text) if tok.lower() not in stop_words)
    return " ".join(kept)

# Strip stop words from both text columns.
text, review = (column.apply(remove_stopwords) for column in (text, review))
#comb = comb.apply(remove_stopwords)

# Next stage: lemmatize every document. The lemmatizer instance is created
# once here and reused by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize ``text`` token by token.

    Tokenizes with ``nltk.word_tokenize``, passes every token through the
    module-level ``lemmatizer`` and returns the results joined by single
    spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, nltk.word_tokenize(text)))

# Lemmatize both text columns.
text, review = (column.apply(lemmatize_text) for column in (text, review))

# Stemmer instance reused by stem_text.
stemmer = PorterStemmer()

def stem_text(text):
    """Stem ``text`` token by token.

    Tokenizes with ``nltk.word_tokenize``, reduces every token with the
    module-level ``stemmer`` and returns the stems joined by single spaces.
    """
    stems = []
    for word in nltk.word_tokenize(text):
        stems.append(stemmer.stem(word))
    return " ".join(stems)

# Stem both text columns.
text, review = (column.apply(stem_text) for column in (text, review))
#print(comb.head())
#print(comb2.head())

#cleaned text


In [10]:
# Show the processed reviews followed by the untouched originals.
print(review, review_origin, sep="\n")

0        one review mention watch 1 oz episod 'll hook ...
1        wonder littl product . < br / > < br / > film ...
2        thought wonder way spend time hot summer weeke...
3        basic 's famili littl boy ( jake ) think 's zo...
4        petter mattei 's `` love time money `` visual ...
                               ...                        
49995    thought movi right good job . n't creativ orig...
49996    bad plot , bad dialogu , bad act , idiot direc...
49997    cathol taught parochi elementari school nun , ...
49998    'm go disagre previou comment side maltin one ...
49999    one expect star trek movi high art , fan expec...
Name: review, Length: 50000, dtype: object
0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
             

In [11]:

# Hold out 33% of the labelled documents for evaluation; the fixed seed keeps
# the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    text,
    d['is_depression'],
    test_size=0.33,
    random_state=42,
)

# Fit the TF-IDF vocabulary on the training split only, then reuse the fitted
# vectorizer to transform the test split.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# Linear-kernel SVM classifier.
clf = SVC(kernel='linear', C=1.0)
clf.fit(x_train, y_train)

# Predict on the held-out split and report each metric.
y_pred = clf.predict(x_test)

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))



Accuracy: 0.950626959247649
Precision: 0.949085123309467
Recall: 0.950597609561753
F1 score: 0.9498407643312102


In [13]:


#x_new = vectorizer.transform(new_text)
# Vectorize the processed reviews with the already-fitted TF-IDF vocabulary
# and classify them with the trained SVM.
x_new = vectorizer.transform(review)
y_new_pred = clf.predict(x_new)

# Sum of the predicted labels (assumes labels are 0/1, so this is the number
# of reviews predicted as class 1 -- confirm against the label column).
n_flagged = sum(y_new_pred)
share_flagged = (n_flagged / len(review)) * 100

print(y_new_pred)
print(f"{share_flagged:.2f}%")
print(len(y_new_pred), n_flagged, sep="\n")




[1 0 0 ... 1 1 0]
38.73%
50000
19364


In [18]:
# For every sentiment label, report how many of its reviews the classifier
# predicted as class 1.
#
# The rows of p and the entries of y_new_pred are matched by position, so the
# two must have the same length; fail loudly if a stale kernel variable breaks
# that instead of silently counting a subset.
if len(y_new_pred) != len(p):
    raise ValueError(
        f"y_new_pred has {len(y_new_pred)} entries but p has {len(p)} rows"
    )

# Boolean mask of reviews predicted as class 1, computed once for all labels.
flagged = np.asarray(y_new_pred) == 1

for name in p['sentiment'].unique():
    print(str(name) + " Count")
    # Select the column by name rather than by position so the counts stay
    # correct regardless of the CSV's column order.
    in_group = (p['sentiment'] == name).to_numpy()
    name_cnt = int(in_group.sum())
    count = int((in_group & flagged).sum())
    print()
    print(str(count) + " out of " + str(name_cnt))
    print(f"{(count / name_cnt) * 100:.2f}%")
    print()
    print()
    
# NOTE: leftover from an earlier experiment on a frame with a 'Political Lean'
# column. The loop used to sit inside a bare triple-quoted string, which is an
# expression: as the last expression of the cell it was echoed back as the
# cell's output value. It is kept as ordinary comments so the cell no longer
# emits that stray string.
#
# Before re-enabling: conserv_count and liberal_count must be initialised to 0.
# The denominators in the prints below have been corrected so that each count
# is divided by the size of its own group (they were swapped).
#
# for i in range(len(y_new_pred)):
#     if y_new_pred[i] == 1:
#         if(p.loc[i,'Political Lean'] == "Conservative"):
#             conserv_count = conserv_count + 1
#         elif(p.loc[i,'Political Lean'] == "Liberal"):
#             liberal_count = liberal_count + 1
#
#print("Conservative Count")
#print(str(conserv_count) + " out of " + str(len(p[p['Political Lean'] == 'Conservative'])))
#print(f"{conserv_count / len(p[p['Political Lean'] == 'Conservative']) * 100:.2f}%")
#print("Liberal Count")
#print(str(liberal_count) + " out of " + str(len(p[p['Political Lean'] == 'Liberal'])))
#print(f"{liberal_count / len(p[p['Political Lean'] == 'Liberal']) * 100:.2f}%")


positive Count

9332 out of 25000
37.33%


negative Count

10032 out of 25000
40.13%




'\nfor i in range(len(y_new_pred)):\n    if y_new_pred[i] == 1:\n        if(p.loc[i,\'Political Lean\'] == "Conservative"):\n            conserv_count = conserv_count + 1\n        elif(p.loc[i,\'Political Lean\'] == "Liberal"):\n            liberal_count = liberal_count + 1\n            '