### Load Data
Load the CSV file and sort the tweets into three lists by sentiment: positive, neutral and negative

In [932]:
import csv


class Positive:
    """Container pairing a tweet's text with its sentiment label (positive bucket)."""

    def __init__(self, text, sentiment):
        # Store both values unchanged; no validation or normalisation is done.
        self.text, self.sentiment = text, sentiment
        
class Neutral:
    """Container pairing a tweet's text with its sentiment label (neutral bucket)."""

    def __init__(self, text, sentiment):
        # Store both values unchanged; no validation or normalisation is done.
        self.text, self.sentiment = text, sentiment
        
class Negative:
    """Container pairing a tweet's text with its sentiment label (negative bucket)."""

    def __init__(self, text, sentiment):
        # Store both values unchanged; no validation or normalisation is done.
        self.text, self.sentiment = text, sentiment
    

# One list per sentiment; each collects wrapper objects for the matching rows.
positive = []
neutral = []
negative = []


# newline="" lets the csv module handle line endings itself, which is required
# for quoted fields (tweet text) that contain embedded line breaks.
with open("Tweets.csv", encoding="utf-8", newline="") as r:
    reader = csv.DictReader(r)
    for linje in reader:
        sentiment = linje["airline_sentiment"]
        if sentiment == "positive":
            positive.append(Positive(linje["text"], sentiment))
        elif sentiment == "neutral":
            neutral.append(Neutral(linje["text"], sentiment))
        else:
            # Any label that is neither "positive" nor "neutral" lands here.
            negative.append(Negative(linje["text"], sentiment))

### Prepare Data
Split up data into train and test set

In [933]:
import random

# Shuffle every class in place so the slices below are random samples.
random.shuffle(positive)
random.shuffle(neutral)
random.shuffle(negative)

# The first 2000 tweets of each class are used for training, the rest for testing.
train_pos, test_pos = positive[:2000], positive[2000:]
train_neu, test_neu = neutral[:2000], neutral[2000:]
train_neg, test_neg = negative[:2000], negative[2000:]

# Merge the held-out tweets of all classes and mix them.
test_tweets = [*test_pos, *test_neu, *test_neg]
random.shuffle(test_tweets)

### Bag of Words 
Vectorize the words of each class so that per-word probabilities can be computed

In [934]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import sys
# NumPy is imported as `np`; the bare name `numpy` is not defined here.
# A threshold of sys.maxsize makes arrays print in full instead of summarised.
np.set_printoptions(threshold=sys.maxsize)

# One identically configured vectorizer per sentiment class, so each class
# gets its own vocabulary. English stop words are removed and max_df=0.95 is
# passed to drop terms by document frequency.
pos_vectorizer = CountVectorizer(max_df=0.95, stop_words=stopwords.words('english'))
neu_vectorizer = CountVectorizer(max_df=0.95, stop_words=stopwords.words('english'))
neg_vectorizer = CountVectorizer(max_df=0.95, stop_words=stopwords.words('english'))


# Learn each vocabulary and build the document-term count matrices.
train_pos_vectors = pos_vectorizer.fit_transform(x.text for x in train_pos)
train_neu_vectors = neu_vectorizer.fit_transform(x.text for x in train_neu)
train_neg_vectors = neg_vectorizer.fit_transform(x.text for x in train_neg)

##### Create two functions: one to compute probabilities and one to multiply all the numbers in a list

In [935]:
def sentiment_probability(dataframe):
    """Return each column's share of the grand total of the frame.

    Args:
        dataframe: A pandas DataFrame of numeric values (here: one column per
            word, one row per tweet, cells holding counts).

    Returns:
        A dict mapping every column label to ``column_sum / total_sum``.
    """
    # The grand total does not depend on the column, so compute it once
    # instead of re-summing the whole frame for every column.
    total = dataframe.values.sum()
    return {column: dataframe[column].sum() / total for column in dataframe}


def multiply_list(liste):
    """Return the product of all elements in *liste* (1 for an empty input)."""
    product = 1
    for factor in liste:
        product *= factor
    return product

### Create Naive Bayes Classifier

In [936]:
import math


class NaiveBayes:
    """Bag-of-words classifier for positive / neutral / negative text.

    Each class has a table mapping a word to its relative frequency in that
    class's training data. A sentence is scored per class by combining the
    smoothed probabilities of its tokens; the highest-scoring class wins.
    No class prior is applied.
    """

    def __init__(self):
        # Per-instance word -> probability tables, filled by fit(). Keeping
        # them on the instance avoids sharing mutable state between models.
        self.pos_dict = {}
        self.neu_dict = {}
        self.neg_dict = {}

    @staticmethod
    def _feature_names(vectorizer):
        """Return the vectorizer's vocabulary in column order.

        Prefers ``get_feature_names_out`` and falls back to the older
        ``get_feature_names`` when the newer method is not available.
        """
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = vectorizer.get_feature_names
        return getter()

    def fit(self, train_pos_vectors, train_neu_vectors, train_neg_vectors):
        """Build the per-class word probability tables.

        Args:
            train_pos_vectors: Count matrix for the positive training tweets.
            train_neu_vectors: Count matrix for the neutral training tweets.
            train_neg_vectors: Count matrix for the negative training tweets.

        Column names are read from the module-level ``pos_vectorizer``,
        ``neu_vectorizer`` and ``neg_vectorizer``, so each matrix must have
        been produced by the corresponding vectorizer.
        """
        pos = pd.DataFrame(train_pos_vectors.toarray(), columns=self._feature_names(pos_vectorizer))
        neu = pd.DataFrame(train_neu_vectors.toarray(), columns=self._feature_names(neu_vectorizer))
        neg = pd.DataFrame(train_neg_vectors.toarray(), columns=self._feature_names(neg_vectorizer))

        self.pos_dict = sentiment_probability(pos)
        self.neu_dict = sentiment_probability(neu)
        self.neg_dict = sentiment_probability(neg)

    def predict(self, sentence, generator=True):
        """Classify *sentence* as "negative", "neutral" or "positive".

        Args:
            sentence: Text to classify; it is lower-cased and tokenized.
            generator: When true, also print the predicted label and the
                token with the highest probability under that label.

        Returns:
            The predicted label. On an exact tie the first label in the
            order negative, neutral, positive is returned; this also covers
            input that yields no tokens.

        Raises:
            ZeroDivisionError: If all three probability tables are empty,
                i.e. the model has not been fitted.
        """
        # Additive smoothing so that unseen words never get probability zero.
        smoothing = 1 / (len(self.pos_dict) + len(self.neu_dict) + len(self.neg_dict))
        tokens = word_tokenize(sentence.lower())

        # Order defines the tie-break: an earlier entry wins on equal scores.
        candidates = (
            ("negative", self.neg_dict),
            ("neutral", self.neu_dict),
            ("positive", self.pos_dict),
        )

        prediction = None
        best_score = None
        best_probs = []
        for label, table in candidates:
            probs = [table.get(token, 0.0) + smoothing for token in tokens]
            # Sum logs instead of multiplying raw probabilities: the ranking
            # is the same, but long inputs no longer underflow to 0.0.
            score = sum(math.log(p) for p in probs)
            if best_score is None or score > best_score:
                prediction, best_score, best_probs = label, score, probs

        if generator:
            # Most probable token under the winning class, if there is one.
            key_word = tokens[best_probs.index(max(best_probs))] if tokens else None
            print("Because of the word: " + str(key_word) + "\nOur classifier predicts:", prediction)
        return prediction

    def evaluate(self, test_tweets):
        """Return the fraction of *test_tweets* that are classified correctly.

        Args:
            test_tweets: Iterable of objects exposing ``text`` and
                ``sentiment`` attributes.

        Returns:
            Accuracy between 0 and 1, or 0.0 when *test_tweets* is empty.
        """
        total = 0
        correct = 0
        for objekt in test_tweets:
            total += 1
            # generator=False keeps predict() from printing for every tweet.
            if self.predict(objekt.text, False) == objekt.sentiment:
                correct += 1

        if total == 0:
            return 0.0
        return correct / total
        

In [937]:
NB = NaiveBayes() # Initialize the model

In [938]:
NB.fit(train_pos_vectors, train_neu_vectors, train_neg_vectors) # Run the training data through the Naive Bayes model

In [939]:
NB.evaluate(test_tweets) # Test the model against the test data

0.7545138888888889

In [None]:
# The method is named `predict` and requires the text to classify.
NB.predict("The flight was great, thank you!") # Replace the string with any text to test