## Importing libraries

In [0]:
import pandas as pd
import numpy as np
import re
import nltk

In [0]:
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
from nltk.sentiment import SentimentIntensityAnalyzer as sia
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




True

## Function text cleaning, tokenization, and lematization

In [0]:
string = "@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]"

In [0]:
# function ctsl: Cleaning, Tokenization, Steaming and Lematization

def ctsl(x):
    
    #Cleaning: Obtainig only letters and words
    
    x = str(x.lower())
    words = []
    replace_web = re.sub('http:(.*?)com|^[A-Za-z]+',' ',x)
    extract_words = ' '.join(re.findall('[A-Z,a-z]+',replace_web))
    extract_words = re.sub(',', ' ', extract_words)
    #Tokenization: Seting to a words list format
    
    tokenization = word_tokenize(extract_words)
    
    # Steamming and lematization: Keeping only common terms in simmilar words  
    
    ste_and_lem = [WordNetLemmatizer().lemmatize(SnowballStemmer('english').stem(x)) for x in tokenization]
    
    # Stop words removal
    
    stop_words = set(stopwords.words('english'))
    removed_st = [w for w in ste_and_lem if not w in stop_words]  
    
    return removed_st

In [0]:
print(ctsl(string))

['ironhack', 'q', 'websit']


## Conducting Sentiment Analysis

In [0]:
df = pd.read_csv('Sentiment140.csv', engine='python', encoding='utf-8', error_bad_lines=False).sample(20000)
df.head()

Unnamed: 0,target,id,date,flag,user,text
236161,0,1979927602,Sun May 31 05:03:25 PDT 2009,NO_QUERY,KOLtwitbot,ZADIG ForEVER: ....I just realized I don't kno...
25211,0,1558259660,Sun Apr 19 07:48:40 PDT 2009,NO_QUERY,H_I_M,http://twitpic.com/3lh5q - It Won't Kill You To
1329015,4,2015598936,Wed Jun 03 05:04:28 PDT 2009,NO_QUERY,jiyoonee,@mindows98 awww good luck min btw .. im here!
541214,0,2200006819,Tue Jun 16 18:18:30 PDT 2009,NO_QUERY,Espinoza_,My computer officially doesn't not turn on
516063,0,2190979977,Tue Jun 16 04:13:48 PDT 2009,NO_QUERY,LoveShante,hey twitterboos i'm sick ya'll cramps


In [0]:
df['text_processed'] = df['text'].apply(ctsl)
df.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
236161,0,1979927602,Sun May 31 05:03:25 PDT 2009,NO_QUERY,KOLtwitbot,ZADIG ForEVER: ....I just realized I don't kno...,"[forev, realiz, know, broken, car, antenna]"
25211,0,1558259660,Sun Apr 19 07:48:40 PDT 2009,NO_QUERY,H_I_M,http://twitpic.com/3lh5q - It Won't Kill You To,"[lh, q, kill]"
1329015,4,2015598936,Wed Jun 03 05:04:28 PDT 2009,NO_QUERY,jiyoonee,@mindows98 awww good luck min btw .. im here!,"[mindow, awww, good, luck, min, btw, im]"
541214,0,2200006819,Tue Jun 16 18:18:30 PDT 2009,NO_QUERY,Espinoza_,My computer officially doesn't not turn on,"[comput, offici, turn]"
516063,0,2190979977,Tue Jun 16 04:13:48 PDT 2009,NO_QUERY,LoveShante,hey twitterboos i'm sick ya'll cramps,"[twitterboo, sick, ya, cramp]"


### Creating Bag of Words

In [0]:
from collections import Counter

df2 = df.copy()
words_list = []

def bag(x):
    for e in x:
        words_list.append(e)
        
df2['text_processed'] = df2['text_processed'].apply(bag)


counts = Counter(words_list).most_common(5000)
counts[:10]

[('go', 1596),
 ('day', 1364),
 ('get', 1330),
 ('wa', 1286),
 ('work', 1006),
 ('like', 1001),
 ('love', 959),
 ('good', 948),
 ('quot', 910),
 ('u', 903)]

In [0]:
bag_words = [i for i,j in counts]
bag_words[:10]

['go', 'day', 'get', 'wa', 'work', 'like', 'love', 'good', 'quot', 'u']

### Building Features

In [0]:
def features(x):
    words = set(x)
    features = {}
    for i in bag_words:
        features[i] = (i in words)
        
    s = sia().polarity_scores(" ".join(x)) # nltk.sentiment.vader.SentimentIntensityAnalyzer: Give a sentiment intensity score to sentences.
    if s["pos"] > 0.2:
        s = True
    else:
        s = False

    return (features, s)

feature = df.text_processed.apply(features)

In [0]:
feature[236161][1]

False

### Building and Traininng Naive Bayes Model

In [0]:
# Link: https://pythonprogramming.net/naive-bayes-classifier-nltk-tutorial/?completed=/words-as-features-nltk-tutorial/

# set that we'll train our classifier with
training_set = feature[:1900]

# set that we'll test against.
testing_set = feature[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

In [0]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 81.82872928176795


In [0]:
classifier.show_most_informative_features(15)

Most Informative Features
                    haha = True             True : False  =     39.6 : 1.0
                    good = True             True : False  =     28.9 : 1.0
                    love = True             True : False  =     27.4 : 1.0
                      ha = True             True : False  =     19.1 : 1.0
                    wish = True             True : False  =     18.5 : 1.0
                  friend = True             True : False  =     14.9 : 1.0
                     yes = True             True : False  =     14.4 : 1.0
                   thank = True             True : False  =     12.3 : 1.0
                    hope = True             True : False  =     11.7 : 1.0
                    play = True             True : False  =     11.2 : 1.0
                    join = True             True : False  =      8.3 : 1.0
                   sweet = True             True : False  =      8.3 : 1.0
                    well = True             True : False  =      7.9 : 1.0