In [29]:
import numpy as nb
import pandas as pd
import pandas_profiling
import nltk as nltk
import re
from string import punctuation
from nltk.corpus import stopwords
from nltk import sent_tokenize
from sklearn.model_selection import train_test_split

In [30]:
#Leser inn via pandas read
data_frame = pd.read_csv("tweets.csv") 

In [31]:
#Splitter dataframen til 2 frames
training_data_frame, test_data_frame = train_test_split(data_frame, test_size=0.2)

In [32]:
#Setter alt til lower
training_data_frame.loc[:, ('text')] = training_data_frame.loc[:, ('text')].str.lower()
#print(training_data_frame["text"])

In [33]:
#Fjerner alt utenom tekst
training_data_frame.loc[:, ('text')]=training_data_frame.loc[:, ('text')].replace({"[^a-zA-Z ]":''}, regex = True)
#print(training_data_frame.loc[:, ('text')])

In [34]:
#Tokenizer alle ordene i tweets.
training_data_frame.loc[:, ('text')] = training_data_frame.loc[:, ('text')].apply(nltk.word_tokenize)
#NB: må kjøres før stopwords er applied

In [35]:
#Applier stop words
stop_words = set(stopwords.words('english')) 
#training_data_frame.loc[:, ('text')] = training_data_frame.loc[:, ('text')].apply(lambda x: [item for item in x if item not in stop_words])

In [11]:
#Denne kalles på en tweet som kommer inn i min naive bayes og renser tweeten.
def the_cleaner(tweet):
    #if type(tweet) == list:
        #return
    tokens = tweet.split()
    tokens = [t.translate(str.maketrans('', '', punctuation)) for t in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    return tokens


In [12]:
#Lager 3 lister med positive, neutrale og negative tweets.
pos_tweets = []
neut_tweets = []
neg_tweets = []

for i in range(len(training_data_frame)):
    if (training_data_frame.iloc[i]["airline_sentiment"] == "positive"):
        pos_tweets.append(training_data_frame.iloc[i]["text"])
        
for i in range(len(training_data_frame)):
    if (training_data_frame.iloc[i]["airline_sentiment"] == "neutral"):
        neut_tweets.append(training_data_frame.iloc[i]["text"])
        
for i in range(len(training_data_frame)):
    if (training_data_frame.iloc[i]["airline_sentiment"] == "negative"):
        neg_tweets.append(training_data_frame.iloc[i]["text"])


In [14]:
#kalkuler forekomst av ord i de forksjellige listene
def freq_calc(incoming_list):
    local_dict = {}
    for x in incoming_list:
        for y in x:
            if y not in local_dict.keys():
                local_dict[y] = 1
            else:
                local_dict[y] += 1
    return local_dict

pos_freq = freq_calc(pos_tweets) 
neut_freq = freq_calc(neut_tweets)
neg_freq = freq_calc(neg_tweets)


In [15]:
#kalkulerer sannsynligheten for at et ord innen for positive, neutral eller negative.
def occurance_sentiments(word_freq):
    occurance_list = {}
    for x in word_freq.keys():
        temp = word_freq[x]/sum(word_freq.values())
        occurance_list[x] = temp
    return occurance_list

#Her lages listene for forekomst
pos_occurance = occurance_sentiments(pos_freq)
neut_occurance = occurance_sentiments(neut_freq)
neg_occurance = occurance_sentiments(neg_freq)


In [16]:
#Finner ut hvor sannsynlig det er at en tweet er positiv, neutral eller negative.
def probability_of_tweets_sentiment(total_tweets, sentiment_amount):
    return sentiment_amount/total_tweets 

#Her lagres verdiene
pos_probability = probability_of_tweets_sentiment(training_data_frame.loc[:, ('text')].count(), len(pos_tweets))
neut_probability = probability_of_tweets_sentiment(training_data_frame.loc[:, ('text')].count(), len(neut_tweets))
neg_probability = probability_of_tweets_sentiment(training_data_frame.loc[:, ('text')].count(), len(neg_tweets))


In [17]:
#Her lages en dictionary med unike ord og forekomsten av det i alle tweets
word_freq = {}
for tweet in training_data_frame.loc[:, ('text')]:
    for word in tweet:
        if word not in word_freq.keys():
            word_freq[word] = 1
        else:
            word_freq[word] += 1


In [18]:
#Her beregens ordets forekomst i alle sentimenter
def occurance_in_all_sentiments(occurance_list):
    oias = {}
    for word in occurance_list:
        if word not in oias:
            oias[word] = occurance_list[word]/sum(word_freq.values())
    return oias

total_occurance_list = occurance_in_all_sentiments(word_freq)


In [22]:
def from_scratch_naive_bayes(tweet):
    temp = the_cleaner(tweet) #rense metode for tweeten
    pos_bayes = 0
    neut_bayes = 0
    neg_bayes = 0
    pos_neut_neg_value = 0
    occ_value = 0
    what_to_return = []
    
    for word in temp:
        if word not in word_freq:
            word = 1/len(word_freq.keys())
            #Task 8: hvis ordet ikke finnes i vokabulæret så vil den bli omgjort til 1 delt på lengden av unike ord
        for positive_word in pos_occurance:
            #Her matcher vi ordet med dens forekomst i positive tweets 
            if word == positive_word:
                pos_bayes += (pos_occurance[word] * pos_probability) / total_occurance_list[word]
                #Her regnes ut naive bayes for ordet i en positiv sammenheng, det samme gjøres for neutral og
                #og negative i for løkkene under
        
        for neutral_word in neut_occurance:
            if word == neutral_word:
                neut_bayes += (neut_occurance[word] * neut_probability) / total_occurance_list[word]
        
        for negative_word in neg_occurance:
            if word == negative_word:
                neg_bayes += (neg_occurance[word] * neg_probability) / total_occurance_list[word]
                #print(neg_occurance[word], neg_probability, total_occurance_list[word])
    
    what_to_return.append(pos_bayes/len(temp))
    what_to_return.append(neut_bayes/len(temp))
    what_to_return.append(neg_bayes/len(temp))
    #Her deles de forskjellige summene vi får på lengden av tweeten og legges i en liste, det størte tallet
    #vil ble valgt av if setningen under. Siden følgen av tallene alltid er positiv, neutral og negativ kan vi
    #returne en string basert på posisjonen som blir returnert.
    
    if what_to_return.index(max(what_to_return))== 0: return "positive"
    if what_to_return.index(max(what_to_return))== 1: return "neutral"
    else: return "negative"
    

In [52]:
def score(test_data_frame):
    outer_temp = []
    total_score = 0
    for i in range(len(test_data_frame)):
        if (test_data_frame.iloc[i]["airline_sentiment"] == from_scratch_naive_bayes(test_data_frame.iloc[i]["text"])):
            total_score += 1
            #Her sammenlignes det som kommer tilbake fra bayes med det som står i sentimentet, og hvis det er likt
            #så plusses det 1 på total_score. Scoren blir deretter delt på lengden av data_framen for å få
            #en nøykatighets score.
    return total_score/len(test_data_frame)
        
        
#Her kalles score med test_data_frame.
print("The accurcy score is: " + str(score(test_data_frame)))


The accurcy score is: 0.7547814207650273


In [53]:
#Task 9, this is a basic module that can take input.
def tweet_input():
    user_input = input("Please enter your desired tweet")
    output = from_scratch_naive_bayes(user_input)
    print("Your bayes output is as follow: "+output)
    
tweet_input()


Please enter your desired tweethappy good yay i like
Your bayes output is as follow: positive


In [107]:
def from_scratch_naive_bayes_with_explanation(tweet):
    temp = the_cleaner(tweet) #rense metode for tweeten
    pos_bayes = 0
    neut_bayes = 0
    neg_bayes = 0
    pos_neut_neg_value = 0
    occ_value = 0
    what_to_return = []
    induvidually_words = {}
    
    for word in temp:
        if word not in word_freq:
            word = 1/len(word_freq.keys())
            
        for positive_word in pos_occurance:
            if word == positive_word:
                pos_bayes += (pos_occurance[word] * pos_probability) / total_occurance_list[word]
        
        for neutral_word in neut_occurance:
            if word == neutral_word:
                neut_bayes += (neut_occurance[word] * neut_probability) / total_occurance_list[word]
        
        for negative_word in neg_occurance:
            if word == negative_word:
                neg_bayes += (neg_occurance[word] * neg_probability) / total_occurance_list[word]
    
    what_to_return.append(pos_bayes/len(temp))
    what_to_return.append(neut_bayes/len(temp))
    what_to_return.append(neg_bayes/len(temp))
    
    for i in range(len(what_to_return)):
        if i == 0:
            induvidually_words["positive naive bayes probablity"]=what_to_return[i]
        if i == 1:
            induvidually_words["neutral naive bayes probablity"]=what_to_return[i]
        if i == 2:
            induvidually_words["negative naive bayes probablity"]=what_to_return[i]
            
    if what_to_return.index(max(what_to_return))== 0: return "positive", induvidually_words
    if what_to_return.index(max(what_to_return))== 1: return "neutral", induvidually_words
    else: return "negative", induvidually_words
    #Dette er en kopi av min første naive bayes metode, bare at den returnerer mer informasjon om en tweet.
    #Den returnerer naive bayes sansynligheten innen for positive, neutral og negative og i tilegg hva svaret den
    #den gir er.

In [106]:
#Task 10 explaination generator
def explainator():
    print(from_scratch_naive_bayes_with_explanation("horrible"))
explainator()


('negative', {'positive naive bayes probablity': 0.04017800351063945, 'neutral naive bayes probablity': 0.05825095002325476, 'negative naive bayes probablity': 0.8244998479596466})


In [None]:
#Taks 11. Pick two correctly and two incorrectly predicted tweet
#pd.set_option('display.max_rows', None)
#display(test_data_frame)
'''
False positive, should be neutral: @USAirways Good morning.
By looking at the weighting of the word "good" and that this tweet is short, the word tips the tweet sentiment to 
positive. It should be neutral is i does not give any positive feedback, and is just saying hello.

False neutral, should be positive: @united Great, thank you!! I'll send it now. 
My earlier explanation on this short tweet goes here, the few words that goes by the stop word sends it to natural 
on how the words weight on the occurance scale.

True negative: @united I will but right now I'm to angry 
True negative: @united So disappointed in the service and the... 
Both of these contains words that are frequent negative, so it makes sense that they will be negative.
'''