### Open a Terminal and run the following commands:

##### Create a directory for NLTK data
mkdir -p ~/nltk_data/corpora

##### Download stopwords manually
curl https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip -o ~/nltk_data/corpora/stopwords.zip

##### Download wordnet manually
curl https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip -o ~/nltk_data/corpora/wordnet.zip

##### Unzip the downloaded files
unzip ~/nltk_data/corpora/stopwords.zip -d ~/nltk_data/corpora/
unzip ~/nltk_data/corpora/wordnet.zip -d ~/nltk_data/corpora/

In [3]:
%pip install pandas numpy nltk scikit-learn

You should consider upgrading via the '/Users/taurangela/Desktop/Github/Stock-Sentimental-Analysis/env/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


##### Step 1: Importing Libraries

In [18]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

##### Step 2: Set the NLTK Data Path

In [19]:
nltk.data.path.append('/Users/taurangela/nltk_data')

##### Step 3: Reading the Dataset

In [20]:
dataset = pd.read_csv('/Users/taurangela/Desktop/Github/Stock-Sentimental-Analysis/data/Sentiment.csv')
dataset = dataset[['text', 'sentiment']]

In [21]:
dataset

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
...,...,...
13866,RT @cappy_yarbrough: Love to see men who will ...,Negative
13867,RT @georgehenryw: Who thought Huckabee exceede...,Positive
13868,"RT @Lrihendry: #TedCruz As President, I will a...",Positive
13869,RT @JRehling: #GOPDebate Donald Trump says tha...,Negative


##### Step 4: Filtering Out Neutral Sentiments

In [22]:
dataset = dataset[dataset['sentiment'] != 'Neutral']

##### Step 5: Splitting the Dataset into Training and Testing Sets

In [23]:
train, test = train_test_split(dataset, test_size=0.1)

##### Step 6: Cleaning and Preprocessing the Text Data

In [24]:
pattern = "(#\w+)|(RT\s@\w+:)|(http.*)|(@\w+)"
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [25]:
def clean_text(data):
    tweets = []
    sentiments = []
    for index, row in data.iterrows():
        sentence = re.sub(pattern, '', row.text)
        words = [e.lower() for e in sentence.split()]
        words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
        words = ' '.join(words)
        tweets.append(words)
        sentiments.append(row.sentiment)
    return tweets, sentiments

train_tweets, train_sentiments = clean_text(train)

##### Step 7: Creating a DataFrame for the Processed Data

In [26]:
processed_data = pd.DataFrame({'tweets': train_tweets, 'sentiments': train_sentiments})

##### Step 8: Encoding Sentiments as Numerical Values

In [27]:
labelencoder = LabelEncoder()
processed_data['sentiments'] = labelencoder.fit_transform(processed_data['sentiments'])

##### Step 9: Converting Words into Vectors using CountVectorizer

In [28]:
cv = CountVectorizer(ngram_range=(1, 3))
cv.fit(processed_data['tweets'])
X_train = cv.transform(processed_data['tweets'])


##### Step 10: Model Building (Sentiment Analysis using Naive Bayes)

In [30]:
classifier = MultinomialNB()
classifier.fit(X_train.toarray(), processed_data['sentiments'])

##### Step 11: Preprocessing the Test Data

In [31]:
test_tweets, test_sentiments = clean_text(test)
final_test_data = pd.DataFrame({'tweets': test_tweets, 'sentiments': test_sentiments})
X_test = cv.transform(final_test_data['tweets'])

##### Step 12: Predicting Sentiments on the Test Data

In [32]:
y_pred = classifier.predict(X_test.toarray())

##### Step 13: Evaluating the Model

In [33]:
final_test_data['sentiments'] = labelencoder.fit_transform(final_test_data['sentiments'])
accuracy = accuracy_score(y_pred, final_test_data['sentiments'])
print(f"Accuracy: {accuracy}")

Accuracy: 0.8378378378378378


##### Step 14: Displaying the Final Test Data with Predictions


In [34]:
final_test_data

Unnamed: 0,tweets,sentiments
0,godfather american,0
1,wish hilary carly fiorina passion spark! take ...,1
2,call gentleman.,1
3,fox made play womens victim professional. ragi...,0
4,"wait said use say ""when used it?""",0
...,...,...
1068,"huckabee like science threatens god, like circ...",0
1069,"""when actually become republican?""",0
1070,candidate received word god?! presidential hop...,0
1071,look substance need discussion. successful eve...,1


#####