In [32]:
# File:   stmt_engine.py
# Author: Colin Szatkowski
# Runs sentiment analysis on the twitter data using the Naive Bayes model

In [33]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
# Machine Learning Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

# Stop word for data preprocessing
stop_words = set(stopwords.words('english'))

In [34]:
# Loads the dataset using utf-8 encoding
def load_dataset_utf(filename, cols):
    dataset = pd.read_csv(filename, encoding='utf-8')
    dataset.columns = cols
    return dataset

In [35]:
# Loads the dataset using latin-1 encoding
def load_dataset_latin1(filename, cols):
    dataset = pd.read_csv(filename, encoding='latin-1')
    dataset.columns = cols
    return dataset

In [36]:
# Deletes columns from the dataset 
def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

In [37]:
# Preprocesses the tweets to delete urls, @'s, stopwords, etc.
def preprocess_tweet_text(tweet):
    tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    # Stems the tweet
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]

    return " ".join(filtered_words)

In [38]:
# Converts the tokens to numbers using vectorizing
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [39]:
# Converts integer value of sentiment to more easily readable string form
def int_to_string(sentiment):
    if sentiment == 0:
        return "Negative"
    elif sentiment == 2:
        return "Neutral"
    else:
        return "Positive"

In [40]:
# Load dataset
print("Now loading dataset")
dataset = load_dataset_latin1("../data/tweets/training.csv", ['target', 't_id', 'created_at', 'query', 'user', 'text'])
# Remove unwanted columns from dataset
print("Now removing unwanted columns")
#n_dataset = remove_unwanted_cols(dataset, ['t_id', 'created_at', 'query', 'user'])
n_dataset = remove_unwanted_cols(dataset, ['t_id', 'created_at', 'query', 'user'])
# Preprocess data
print("Now processing unpreprocessed data")
dataset.text = dataset['text'].apply(preprocess_tweet_text)

Now loading dataset


FileNotFoundError: [Errno 2] No such file or directory: 'data/tweets/training.csv'

In [19]:
# Split dataset into train and test sets
print("Now processing preprocessed data")

# Finds the feature vector and splits the sample tweet data set into a training and test split 80:20
tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print("Naive Bayes accuracy score:")
print(accuracy_score(y_test, y_predict_nb))

# Training Logistics Regression model
#LogR_model = LogisticRegression(solver='lbfgs')
#LogR_model.fit(X_train, y_train)
#y_predict_log_r = LogR_model.predict(X_test)
#print("Logistics regression accuracy score:")
#print(accuracy_score(y_test, y_predict_log_r))

# Training Linear Regression model
#print("Starting Linear Regression")
#LinR_model = LinearRegression()
#LinR_model.fit(X_train, y_train)
#y_predict_lin_r = LinR_model.predict(X_test)
#print("Linear regression accuracy score:")
#print(accuracy_score(y_test, y_predict_lin_r))

Now processing preprocessed data
Naive Bayes accuracy score:
0.768521875
Logistics regression accuracy score:
0.7877875


In [31]:
#ticker_name may be changed to generate sentiment analysis for any tweet
ticker_name = "goog"
test_file_name = f"../data/tweets/{ticker_name}_twt.csv"
test_ds = load_dataset_utf(test_file_name, ["id", "date", "class", "percent", "text"])
test_ds = remove_unwanted_cols(test_ds, ["id"])

# Creates text feature
test_ds.text = test_ds["text"].apply(preprocess_tweet_text)
test_feature = tf_vector.transform(np.array(test_ds["text"]).ravel())

# Uses Naive Bayes model for prediction
test_prediction = NB_model.predict(test_feature)

# Exports the results to a .csv file
test_result_ds = pd.DataFrame({'date': test_ds["date"], 'class': test_ds["class"], 'percent': test_ds["percent"], 'sentiment':test_prediction, 'text': test_ds["text"]})
test_result_ds.sentiment = test_result_ds['sentiment'].apply(int_to_string)
test_result_ds.to_csv(f"../data/sentiment/{ticker_name}_stmt.csv", index=True, index_label='id')
#print(test_result_ds)

FileNotFoundError: [Errno 2] No such file or directory: 'CSC_422/tweets/goog_twt.csv'