In [7]:
import pandas as pd
import numpy as np
import re
import string


In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [45]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/jshort/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [37]:
def load_dataset(filename, cols):
    dataset = pd.read_csv(filename, index_col=False, header=None, sep=',', names=cols)
    dataset.cols = cols
    return dataset


In [63]:
#Remove the columns we don't care about
def remove_unwanted_cols(dataset, cols):
    for col in cols:
        try:
            del dataset[col]
        except:
            print(f'Column {col} already removed')
    return dataset

In [52]:
def preprocess_tweet_text(tweet):
    tweet.lower()
    
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    
    print(tweet_tokens)
    
    filtered_words = [w for w in tweet_tokens if not w in stopwords]
    
    # Stemming is faster than Lemmatizer. 
    # You can uncomment the code and see how results change. 
    # Note: Do not apply both
    
    #ps = PorterStemmer()
    #stemmed_words = [ps.stem(w) for w in filtered_words]
    #lemmatizer = WordNetLemmatizer()
    #lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(filtered_words)

In [13]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [14]:
def int_to_string(sentiment):
    if sentiment == 0:
        return "Negative"
    elif sentiment == 2:
        return "Neutral"
    else:
        return "Positive"
    

In [57]:
# Load dataset
dataset = load_dataset("data/testdata.manual.2009.06.14.csv", ['score', 'id', 'created_at', 'query', 'user', 'text'])



  This is separate from the ipykernel package so we can avoid doing imports until


In [64]:
dataset.columns
dataset.head()

Unnamed: 0,score,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [65]:
# Remove unwanted columns from dataset
dataset = remove_unwanted_cols(dataset, ['id', 'created_at', 'query', 'user'])


Column id already removed
Column created_at already removed
Column query already removed
Column user already removed


In [66]:
dataset.head()

Unnamed: 0,score,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [67]:
#Preprocess data
dataset.text = dataset['text'].apply(preprocess_tweet_text)


['I', 'loooooooovvvvvveee', 'my', 'Kindle2', 'Not', 'that', 'the', 'DX', 'is', 'cool', 'but', 'the', '2', 'is', 'fantastic', 'in', 'its', 'own', 'right']


TypeError: argument of type 'LazyCorpusLoader' is not iterable

In [None]:
# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))