In [94]:
# Import libraries
# DataFrame
import pandas as pd
import numpy as np

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import classification_report 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.naive_bayes import BernoulliNB

In [3]:
# Import data source file
reddit_data = pd.read_csv('Reddit_Data.csv')
twitter_data = pd.read_csv('Twitter_Data.csv')

In [4]:
# View Reddit data
reddit_data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [5]:
# View Twitter data
twitter_data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [6]:
# Examine data types
reddit_data.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 9.6 MB


In [7]:
# Examine data types
twitter_data.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 32.4 MB


In [8]:
# Check for null values
reddit_data.isnull().sum()

clean_comment    100
category           0
dtype: int64

In [9]:
# Check for null values
twitter_data.isnull().sum()

clean_text    4
category      7
dtype: int64

In [10]:
# Changing the name of twitter_data's "clean_text" to "clean_comment" so that we can join the two dataframes
twitter_data = twitter_data.rename(columns={"clean_text": "clean_comment"})

# Filling NA "category" values with 0 (a neutral value) so we can convert these values to ints
twitter_data['category'] = twitter_data['category'].fillna(0)

# Converting twitter_data's float category values to ints to align with reddit_data
twitter_data['category'] = twitter_data['category'].astype(int)

In [11]:
# Join both datasets & checking work
joined_data = pd.concat([twitter_data, reddit_data])
joined_data.head()

Unnamed: 0,clean_comment,category
0,when modi promised “minimum government maximum...,-1
1,talk all the nonsense and continue all the dra...,0
2,what did just say vote for modi welcome bjp t...,1
3,asking his supporters prefix chowkidar their n...,1
4,answer who among these the most powerful world...,1


In [12]:
# Use drop_duplicates() method to remove duplicates
joined_data = joined_data.drop_duplicates()

In [13]:
# Drop null values & check work
joined_data = joined_data.dropna()
joined_data.isnull().sum()

clean_comment    0
category         0
dtype: int64

In [14]:
# Assign data to X and y variables 
X = joined_data['clean_comment']
y = joined_data['category']

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 42)  # default train/test split of 70/30, adjust if needed
total_train = len(X_train)      # used to configure number of LDA components - NOT ENOUGH RAM
total_test = len(X_test)

In [16]:
# Generating feature vectors with the use of TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words = ENGLISH_STOP_WORDS)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [17]:
# Initiate & train the model
model = LinearSVC()
model.fit(X_train, y_train)

LinearSVC()

In [18]:
# Predicting the values & generating classification report 
pred = model.predict(X_test)
report = classification_report(y_test, pred, output_dict = True)
# Checking model accuracy 
print(report['accuracy'])

0.8651923558374364


In [34]:
# Topic modeling using LDA 
topic_model = LatentDirichletAllocation(100, random_state = 42, n_jobs = -1)
topic_model.fit_transform(X_train)
components = topic_model.components_
features = vectorizer.get_feature_names()
topic_dict = {}

# Display prevalent topic for top 100 entries 
for topic_num, component in enumerate(components):
  topics = zip(features, component)
  sorted_topics = sorted(topics, key= lambda x:x[1], reverse=True)[:1]
  for topic in sorted_topics:
    topic_dict.update({topic_num: topic[0]})

print(topic_dict)

# LDA model takes extremely (!) long to converge. Need to find an alternative method to avoid loading an entire corpus into memory



{0: 'modi', 1: 'wtf', 2: 'ambani', 3: 'guarantee', 4: 'press', 5: 'modi', 6: 'modi', 7: 'hai', 8: 'modi', 9: 'modi', 10: 'modi', 11: 'dog', 12: 'mallya', 13: 'wah', 14: 'modi', 15: 'message', 16: 'nirav', 17: 'fulfilled', 18: 'modi', 19: 'accounts', 20: 'thank', 21: 'whatsapp', 22: 'campaigning', 23: 'modi', 24: 'varanasi', 25: 'modi', 26: 'loves', 27: 'best', 28: 'arnab', 29: 'modi', 30: 'modi', 31: 'code', 32: 'modi', 33: 'modi', 34: 'shot', 35: 'modi', 36: 'modi', 37: 'strike', 38: 'modi', 39: 'modi', 40: 'addressing', 41: 'modi', 42: 'modi', 43: 'modi', 44: 'jai', 45: 'proof', 46: 'love', 47: 'modi', 48: 'modi', 49: 'modi', 50: 'andhra', 51: 'shah', 52: 'pic', 53: 'haha', 54: 'modi', 55: 'wins', 56: 'good', 57: 'unemployment', 58: 'elect', 59: 'jaitley', 60: 'modi', 61: 'bro', 62: 'har', 63: 'modi', 64: 'thread', 65: 'vivek', 66: 'modi', 67: 'modi', 68: 'super', 69: 'modi', 70: 'master', 71: 'fan', 72: 'modi', 73: 'addresses', 74: 'space', 75: 'awesome', 76: 'jammu', 77: 'modi', 78

In [35]:
# Most common topics in the sample
from collections import Counter 
counter = Counter(topic_dict.values())
top_3 = counter.most_common(3)
print(top_3)          # most discussed topic: Indian prime minister Narendra Modi

[('modi', 42), ('wtf', 1), ('ambani', 1)]


In [96]:
# Testing an alternative model for sentiment analysis
NB_model = BernoulliNB()
NB_model.fit(X_train, y_train)

BernoulliNB()

In [97]:
# Predicting the values & generating classification report 
NB_pred = NB_model.predict(X_test)
NB_report = classification_report(y_test, NB_pred, output_dict = True)
# Checking model accuracy 
print(NB_report['accuracy'])        # accuracy is substantially higher as opposed to Multinomial NB

0.7246933155303347
