In [12]:
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords

In [13]:
# Load the labelled sentiment dataset, decoding the CSV as ISO-8859-1.
# NOTE(review): this is an absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv('/Users/owlthekasra/Documents/Code/Python/WikiNLP/Sentiment_Analysis_Dataset.csv', encoding="ISO-8859-1")
# df = df.iloc[0::100, :] # make training set easier to work with
# Keep only the columns used below and renumber the rows from 0.
# drop=True discards the old index directly instead of materialising it as a
# column that then has to be sliced off again.
df = df[["Sentiment", "SentimentSource", "SentimentText"]].reset_index(drop=True)


  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
#%% Split the tweet text by sentiment label: 1 goes to `positive`, 0 to `negative`
is_positive = df["Sentiment"] == 1
is_negative = df["Sentiment"] == 0
positive = df.loc[is_positive, "SentimentText"]
negative = df.loc[is_negative, "SentimentText"]

In [15]:
#%% Remove stop words to find meaningful word frequencies
stop_words = set(stopwords.words('english'))
# Extra tokens to ignore on top of NLTK's English list.
added_words = ["-", "get", "going", "go", "I'm", "im", "u","know", "&amp;", "got", "I'll", "@", "that's", "like", "really", "one", "...", "..", "2", "?", "&lt;3","see"]
# getMostFrequent tests `w.lower() in stop`, so every entry has to be stored
# lower-cased: a mixed-case entry such as "I'm" or "I'll" could never match.
stop = stop_words.union(word.lower() for word in added_words)

In [16]:
#%% functions to determine frequency of words
def wordListToFreqDf(wordlist):
    """Pair every word of ``wordlist`` with its number of occurrences in the list.

    Parameters
    ----------
    wordlist : list of str
        Words in their original order; repeated words are expected.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``wordlist`` (duplicates included). Column ``0``
        holds the word and column ``1`` holds how many times that word appears
        in ``wordlist``. An empty list yields an empty frame that still has
        columns ``0`` and ``1``.
    """
    # Count once up front: calling wordlist.count() for every element is
    # quadratic in the number of words.
    counts = Counter(wordlist)
    rows = [(word, counts[word]) for word in wordlist]
    return pd.DataFrame(rows, columns=[0, 1])

def getMostFrequent(posneg):
    """Count how often each non-stop word occurs across a collection of texts.

    Parameters
    ----------
    posneg : iterable of str
        Texts to analyse; each one is split on whitespace.

    Returns
    -------
    pandas.DataFrame
        Indexed by word (index named ``0``), with a single column ``1`` holding
        the number of occurrences, sorted from most to least frequent.
    """
    words = [word for text in posneg for word in text.split()]
    # The comparison is case-insensitive, but the original casing is what gets counted.
    kept = [word for word in words if word.lower() not in stop]
    freqdf = wordListToFreqDf(kept)
    # groupby orders the groups by key itself, so sorting the per-occurrence
    # rows beforehand has no effect on the result and is skipped.
    return freqdf.groupby(0).count().sort_values(by=1, ascending=False)

In [None]:
#%% find most positive and negative words
pos_freq = getMostFrequent(positive).reset_index()
neg_freq = getMostFrequent(negative).reset_index()

# Drop any remaining "I'm" rows and renumber so the row label is rank - 1.
pos_freq = pos_freq[pos_freq[0]!="I'm"].reset_index(drop=True)
neg_freq = neg_freq[neg_freq[0]!="I'm"].reset_index(drop=True)

# .copy() gives independent frames: columns are added to these later, and
# assigning into a slice of pos_freq/neg_freq raises SettingWithCopyWarning.
top_10_pos = pos_freq.iloc[0:10,0:2].copy()
top_10_neg = neg_freq.iloc[0:10,0:2].copy()

In [11]:
#%% Zipf check: add rank and rank * count for the top words of each class
for top_words in (top_10_pos, top_10_neg):
    # Rank is the row label plus one.
    top_words['rank'] = top_words.index + 1
    top_words['zipf'] = top_words[1] * top_words['rank']

# Multiplying the count by the rank does not approach a constant here,
# which may or may not be a consequence of removing the stop words.

In [None]:
#%% Plot rank against count for the top positive and negative words
import matplotlib.pyplot as plt
import seaborn as sns

# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there.
ax = sns.lineplot(x=top_10_pos[1], y=top_10_pos['rank'], label='positive')
sns.lineplot(x=top_10_neg[1], y=top_10_neg['rank'], label='negative', ax=ax)
ax.set(xlabel='word count', ylabel='rank', title='Rank vs. count of the most frequent words');

# The inverse-proportionality part of Zipf's law appears to hold:
# the plot shows a curve that looks vaguely like something between 1/x and -x.

In [None]:
#%% Machine Learning Classification

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
#%% Getting rid of symbols from text
## from https://stackabuse.com/text-classification-with-python-and-scikit-learn
from nltk.stem import WordNetLemmatizer
import re

# A lemmatizer despite the name; the name is kept in case other cells refer to it.
stemmer = WordNetLemmatizer()


def _clean_document(text):
    """Normalise one raw text into a lower-cased, lemmatized, space-joined string."""
    # Replace every non-word character with a space
    document = re.sub(r'\W', ' ', str(text))

    # Remove single letters that stand between whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start. The caret must be unescaped to
    # anchor at the start; r'\^' would only match a literal "^" character.
    # This also covers a leading lone "b", so no separate step is needed for it.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse runs of whitespace into a single space
    document = re.sub(r'\s+', ' ', document)

    # Lower-case, then lemmatize word by word
    words = document.lower().split()
    return ' '.join(stemmer.lemmatize(word) for word in words)


# Read the raw text straight from df rather than from X: X is only assigned in
# a later cell and is afterwards overwritten with the TF-IDF matrix, so relying
# on it breaks a top-to-bottom run and makes this cell unsafe to re-run.
documents = [_clean_document(text) for text in df['SentimentText']]

In [None]:
#%% Create data and class sets
X = df['SentimentText']  # raw text of each observation
y = df['Sentiment']      # class label of each observation


In [None]:
#%% Vectorize text
# stop_words has to be 'english', a list, or None: recent scikit-learn releases
# validate the parameter and reject a set. sorted() converts the set to a list
# with a deterministic order.
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=sorted(stop))
# NOTE(review): .toarray() densifies the matrix (rows x up to 1500 features);
# on the full dataset this may need a lot of memory -- confirm it is required.
X = tfidfconverter.fit_transform(documents).toarray()


In [None]:
#%% Hold out one tenth of the observations as the test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
#%% Test on Random Forest
# Same hyper-parameters for the hold-out model and the cross-validated one.
rf_params = {"n_estimators": 1000, "random_state": 0}

classifier = RandomForestClassifier(**rf_params)
classifier.fit(X_train, y_train)

# Hold-out predictions and accuracy
y_pred = classifier.predict(X_test)
acc_rf = accuracy_score(y_pred, y_test)

# 10-fold cross-validated accuracy on the full data with a fresh estimator
cv_score_rf = cross_val_score(
    RandomForestClassifier(**rf_params),
    X,
    y,
    scoring='accuracy',
    cv=10,
)

In [None]:
#%% Test on Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Hold-out predictions and accuracy
y_pred_clf = clf.predict(X_test)
acc_nb = accuracy_score(y_pred_clf, y_test)

# 10-fold cross-validated accuracy on the full data with a fresh estimator
cv_score_nb = cross_val_score(
    MultinomialNB(),
    X,
    y,
    scoring='accuracy',
    cv=10,
)

In [None]:
#%% Check Kappa
def getKappa(test, pred):
    """Compute Cohen's kappa from the confusion matrix of ``test`` vs ``pred``.

    Parameters
    ----------
    test : array-like
        Reference labels, passed as the first argument to ``confusion_matrix``.
    pred : array-like
        Predicted labels, passed as the second argument to ``confusion_matrix``.

    Returns
    -------
    float
        ``(observed - expected) / (total - expected)``, where ``observed`` is
        the sum of the diagonal, ``total`` is the number of observations and
        ``expected`` is the sum over classes of row total * column total,
        divided by ``total``.
    """
    cm = confusion_matrix(test, pred)
    row_totals = cm.sum(axis=1)   # per-class totals along each row
    col_totals = cm.sum(axis=0)   # per-class totals along each column
    total = cm.sum()
    observed = cm.trace()         # agreements sit on the diagonal
    expected = (row_totals * col_totals).sum() / total
    return (observed - expected) / (total - expected)

# Kappa of the random-forest and naive-Bayes hold-out predictions
kappa_forest = getKappa(y_test, y_pred)
kappa_np = getKappa(y_test, y_pred_clf)