In [19]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import math
import re
import heapq

In [20]:
# Read data into pandas and assign the return dataframe to data variable
data = pd.read_csv("Tweets.csv")

In [21]:
# This method shows us the first entries in the dataframe
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [22]:
# Columns to be removed
drop_columns = ["tweet_id", "airline_sentiment_gold", "negativereason_gold", "retweet_count", "tweet_coord", "tweet_location", "user_timezone", "tweet_created"]

# Remove columns we dont care about for our assignment
data.drop(drop_columns, inplace = True, axis = 1, errors = "ignore")

# Refresh head to see current dataset
data.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,text
0,neutral,1.0,,,Virgin America,cairdin,@VirginAmerica What @dhepburn said.
1,positive,0.3486,,0.0,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...
2,neutral,0.6837,,,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...
3,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...
4,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...


## Features

<br>

**airline_sentiment:** Airline sentiment

Expected type: Three possible string values (positive, neutral, negative)

<br>

**airline_sentiment_confidence:** Confidence score of airline entiment

Expected type: A floating point number from 0 - 1 

<br>

**negativereason:** Reason for negative sentiment

Expected type: A short commentary of the reason for the negative sentiment score

<br>

**negativereason_confidence:** Confidence score for the negative reason of the airline sentiment

Expected type: A floating point number from 0-1 

<br>

**airline:** Airline name

Expected type: String

<br>

**name:** twitter user name for the tweet in question

Expected type: String

<br>

**text:** the text content of the tweet

Expected type: String


In [23]:
# The method isnull is supposed to take all the empty fields or fields with n/a or no value and change them to NaN fields
data["airline_sentiment"].isnull()
data["airline_sentiment_confidence"].isnull()
data["negativereason"].isnull()
data["negativereason_confidence"].isnull()
data["airline"].isnull()
data["name"].isnull()
data["text"].isnull()

data.head()

# Possible TODO: Check that all null-values are converted 
# also change values that do not have information like ("can't tell")

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,text
0,neutral,1.0,,,Virgin America,cairdin,@VirginAmerica What @dhepburn said.
1,positive,0.3486,,0.0,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...
2,neutral,0.6837,,,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...
3,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...
4,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...


In [24]:
# Split data into training and test sets



In [56]:
def text_cleanup(text):
    
    # make all tweets lowercase
    text = text.lower()
    
    # remove receipent of tweet
    remove_receipent = re.sub(r"@+\w*","", text)
    
    # remove links
    remove_links = re.sub(r"https*:\/{2}\w.*\/\w*", "", remove_receipent)
    
    # remove special characters
    remove_special_chars = re.sub(r"[^a-z ]", "", remove_links)
    
    # remove extra whitspace
    remove_extra_whitespace = re.sub(r" {1,}", " ", remove_special_chars)
    
    # remove single alphabetical characters
    output = re.sub(r"\b[b-z]\b", "", remove_extra_whitespace)
    
    # remove trailing and leading whitespaces and return
    return output.strip()
    

In [70]:
word_corpus =  []
sentence_corpus = []

# Iterates each line of the tweets-text in the dataset and formats
for tweets_text in data.text:
    # Takes tweet and removes everything except main text 
    clean_text = text_cleanup(tweets_text)
    
    # Sentences
    sentences = clean_text.split(" ")
    sentence_corpus.append(sentences)
    
    # Words
    for word in sentences: 
        word_corpus.append(word)
print(corpus)
    

    



In [40]:
# count frequency of words and make dictionary with word and word count
word_frequency = {}

for sentence in sentence_corpus:
    for word in sentence: 
        if len(word) < 2:
            continue
        if word not in word_frequency.keys():
            word_frequency[word] = 1
        else:
            word_frequency[word] += 1
            
            

# Construct dataframe from dictionary data 
word_freq_dataframe = pd.DataFrame.from_dict(word_frequency, orient = "index", columns = ["frequency"])

# Sorted dataframe
sorted_dataframe = word_freq_dataframe.sort_values(by=["frequency"], ascending = False)


In [None]:
# Compute the probability of a word being in a sentence
# P(Word|Sentence) = 
# P(Sentence) * P(Word) / P(Sentence)
 


In [72]:
# filter dataframe to only include words with over 50 mentions

filtered_dataframe = sorted_dataframe["frequency"] >= 50

df = sorted_dataframe[filtered_dataframe]

display(df)

Unnamed: 0,frequency
to,8642
the,6050
you,4106
for,3992
flight,3893
on,3772
and,3707
my,3271
is,2824
in,2529


Big-C (C) = nested corpus
Class (c) = Yn or output variable, e.g. sentences
Bag-of-words(d) = x or input variable, e.g our entire corpus 

In our dataset there is a set of sentences and a set with all words from all sentences(bag-of-words.

The bag-of-words is the document (d) or (doc) of our naive bayes classifier

The sentences are the classes (c) of our naive bayes classifier

The features of our dataset is the existence of a word in the bag-of-words

To compute the likelyhood of a word being in the bag of words we first count the number of times the word appears in the bag-of-words and divide by the total number of words being in the bag-of-words


In [29]:
###########

# TODO: Calculate probability of:
# 1. A given word in my bag-of-words being in the sample set of sentences
# 2. A given word being in a sentence overall
     #P(class|data) = (P(data|class) * P(class)) / P(data)
# Add airline_name as topic in a dictionary entry

###########