In [40]:
import pandas as pd
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import *
from nltk.stem.porter import *
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from stop_words import get_stop_words
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [41]:
txt = "Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do."
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(txt)



{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.8442}

In [59]:
# CARGA DEL DATASET:

In [2]:
sentiment = pd.read_csv('./Sentiment140.csv/Sentiment140.csv')

In [3]:
sentiment.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
dfsample = sentiment.sample(n=500)


In [None]:
# FUNCIONES PARA EL CLEANING DEL TEXTO

In [5]:
def clean_up(s):

    s = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', s)

    s = re.sub('[^a-zA-Z_]+', ' ', s)

    s = s.lower()

    return s



def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    return word_tokenize(s)


def stem_and_lemmatize(l):

    result = []

    for i in l:

        sno = nltk.stem.SnowballStemmer('english')

        x = sno.stem(i)

        lemmatizer = WordNetLemmatizer()

        x = lemmatizer.lemmatize(x)

        result.append(x)

    return result

def remove_stopwords(l):

    stop_words = get_stop_words('en')

    filtered_words = [word for word in l if word not in stop_words]

    return filtered_words


In [None]:
# EJECUCIÓN DE FUNCIONES DE LIMPIEZA DE TEXTO

In [6]:
dfsample['text_processed'] = dfsample['text'].apply(clean_up)


In [7]:
dfsample['text_processed'] = dfsample['text_processed'].apply(tokenize)

In [8]:
dfsample['text_processed'] = dfsample['text_processed'].apply(stem_and_lemmatize)

In [9]:
dfsample['text_processed'] = dfsample['text_processed'].apply(remove_stopwords)

In [10]:
dfsample.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
771198,0,2302274066,Tue Jun 23 16:34:46 PDT 2009,NO_QUERY,benwablz,Just returned from Funroe...not looking forwar...,"[just, return, funro, look, forward, go, back,..."
339690,0,2014692444,Wed Jun 03 02:22:16 PDT 2009,NO_QUERY,adeyus,needs to open up a lil bit more... sorry baby,"[need, open, lil, bit, sorri, babi]"
114455,0,1826305009,Sun May 17 08:11:25 PDT 2009,NO_QUERY,Lauren574,is going to her grandpas 75th birthday party (...,"[go, grandpa, th, birthday, parti, haha, inste..."
257744,0,1985035567,Sun May 31 16:17:56 PDT 2009,NO_QUERY,ErikaBarros,@taylorswift13 i dont have nbc.. i live in bra...,"[taylorswift, dont, nbc, live, brazil, want, b..."
68286,0,1692776723,Sun May 03 20:45:30 PDT 2009,NO_QUERY,wondrous_as_u,My B Bear is not himself. Nothing so sad as a...,"[b, bear, noth, sad, puppi, s, dump, s, probab..."


In [None]:
# CREACIÓN DEL BAG OF WORDS

In [16]:
words = []
for x in dfsample.text_processed:
    words += x




In [36]:
fdist = FreqDist(words)

voc = fdist.most_common(5000)

bag_of_words = [x[0] for x in voc]


In [None]:
# CREACIÓN DEL DICCIONARIO PARA APLICAR nltk.NaiveBayesClassifier

In [50]:
def find_features(document):
    words = set(document)
    features = {}
    for w in bag_of_words:
        features[w] = (w in words)
    s = SentimentIntensityAnalyzer().polarity_scores(" ".join(document))
    if s["pos"] > 0.2:
        s = True
    else:
        s = False
    return list((features, s))






In [51]:
feature_in_text = dfsample['text_processed'].apply(find_features)
feature_in_text

771198     [{'s': False, 't': False, 'go': True, 'just': ...
339690     [{'s': False, 't': False, 'go': False, 'just':...
114455     [{'s': False, 't': False, 'go': True, 'just': ...
257744     [{'s': False, 't': False, 'go': False, 'just':...
68286      [{'s': True, 't': False, 'go': False, 'just': ...
873475     [{'s': False, 't': False, 'go': False, 'just':...
611451     [{'s': False, 't': False, 'go': False, 'just':...
1239595    [{'s': False, 't': False, 'go': False, 'just':...
1039038    [{'s': False, 't': False, 'go': False, 'just':...
1505898    [{'s': False, 't': False, 'go': False, 'just':...
324504     [{'s': False, 't': True, 'go': False, 'just': ...
602208     [{'s': False, 't': True, 'go': True, 'just': F...
1486925    [{'s': False, 't': False, 'go': False, 'just':...
995884     [{'s': False, 't': False, 'go': False, 'just':...
577495     [{'s': False, 't': False, 'go': True, 'just': ...
841638     [{'s': False, 't': False, 'go': False, 'just':...
175045     [{'s': False,

In [None]:
# CREACIÓN DE TRAINING Y TEST SET

In [52]:
training_set = feature_in_text[:400]
testing_set = feature_in_text[400:]

In [None]:
# GENERACIÓN DEL CLASSIFIER

In [53]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [54]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 82.0


In [55]:
classifier.show_most_informative_features(15)

Most Informative Features
                      ha = True             True : False  =     10.6 : 1.0
                     lol = True             True : False  =      9.7 : 1.0
                    hope = True             True : False  =      8.7 : 1.0
                   thank = True             True : False  =      8.5 : 1.0
                    well = True             True : False  =      6.9 : 1.0
                    like = True             True : False  =      6.9 : 1.0
                     yes = True             True : False  =      6.9 : 1.0
                    good = True             True : False  =      6.5 : 1.0
                    miss = True            False : True   =      5.6 : 1.0
                    wish = True             True : False  =      5.2 : 1.0
                     way = True             True : False  =      5.1 : 1.0
                      na = True            False : True   =      4.5 : 1.0
                  better = True             True : False  =      4.1 : 1.0

In [57]:
nltk.classify.accuracy(classifier, testing_set)

0.82

Bonus Question 1: Improve Model Performance

In [None]:
# Realizado en un jupyter a parte: Bonus-2

Bonus Question 2: Machine Learning Pipeline

In [58]:
# Realizado en un jupyter a parte: Bonus-2