In [3]:
import nltk
import pandas as pd
import numpy as np
import re

# Challenge 1

In [6]:
from nltk.corpus import brown
# Download the Brown corpus data read through the `brown` corpus reader.
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/EstebanCardona/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [8]:
# Show the first ten words of the Brown corpus.
brown.words()[0:10]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [7]:
# Show the first ten (word, tag) pairs of the Brown corpus.
brown.tagged_words()[0:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [10]:
# Download the 'punkt' tokenizer package (unzipped under tokenizers/).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/EstebanCardona/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
text = 'Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'

from nltk import sent_tokenize, word_tokenize

# Split the sample paragraph into a list of sentences and print it.
print(sent_tokenize(text))

['Ironhack is a Global Tech School ranked num 2 worldwide.', 'Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.', 'This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course.', 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']


In [13]:
# Split the sample paragraph into word and punctuation tokens and print them.
print(word_tokenize(text))

['Ironhack', 'is', 'a', 'Global', 'Tech', 'School', 'ranked', 'num', '2', 'worldwide', '.', 'Our', 'mission', 'is', 'to', 'help', 'people', 'transform', 'their', 'careers', 'and', 'join', 'a', 'thriving', 'community', 'of', 'tech', 'professionals', 'that', 'love', 'what', 'they', 'do', '.', 'This', 'ideology', 'is', 'reflected', 'in', 'our', 'teaching', 'practices', ',', 'which', 'consist', 'of', 'a', 'nine-weeks', 'immersive', 'programming', ',', 'UX/UI', 'design', 'or', 'Data', 'Analytics', 'course', 'as', 'well', 'as', 'a', 'one-week', 'hiring', 'fair', 'aimed', 'at', 'helping', 'our', 'students', 'change', 'their', 'career', 'and', 'get', 'a', 'job', 'straight', 'after', 'the', 'course', '.', 'We', 'are', 'present', 'in', '8', 'countries', 'and', 'have', 'campuses', 'in', '9', 'locations', '-', 'Madrid', ',', 'Barcelona', ',', 'Miami', ',', 'Paris', ',', 'Mexico', 'City', ',', 'Berlin', ',', 'Amsterdam', ',', 'Sao', 'Paulo', 'and', 'Lisbon', '.']


# Challenge 2

In [112]:
s = """@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")"""

# Desired result of cleaning `s`: " ironhack s  q website  is "
# (the exact amount of whitespace produced by clean_up may differ).

In [113]:
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    URLs are removed first so that their fragments do not survive as
    tokens. Every remaining non-word character and every digit is then
    replaced by a single space, and the result is lower-cased. Replaced
    characters are not collapsed, so runs of spaces can remain; underscores
    count as word characters and are kept.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    # Match from the scheme up to the next whitespace. The previous pattern
    # was greedy and limited to a few three-letter TLD shapes, so it
    # swallowed the text between two URLs and missed links such as
    # http://bit.ly/xyz entirely.
    s = re.sub(r'https?://\S+', ' ', s, flags=re.IGNORECASE)
    # Non-word characters and digits each become one space.
    s = re.sub(r'[\W\d]', ' ', s)
    return s.lower()

In [114]:
s = clean_up(s)
s

' ironhack s  q website     is             '

In [115]:
def tokenize(s):
    """
    Break a string into tokens using NLTK's word tokenizer.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    tokens = nltk.word_tokenize(s)
    return tokens

In [116]:
s = tokenize(s)
s

['ironhack', 's', 'q', 'website', 'is']

In [117]:
def stem_and_lemmatize(list_of_words):
    """
    Perform stemming and lemmatization on a list of words.

    Each word is first reduced to its Porter stem, and that stem is then
    passed through the WordNet lemmatizer.

    Args:
        list_of_words: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized, in the same
        order as the input.
    """
    # Imported locally because no cell in the notebook brings these two
    # classes into scope; without this the function raises NameError on a
    # freshly started kernel.
    # NOTE(review): WordNetLemmatizer presumably needs the 'wordnet' corpus
    # to be downloaded first; no cell in the notebook does so - confirm.
    from nltk.stem import PorterStemmer, WordNetLemmatizer

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in list_of_words]
      

In [118]:
s = stem_and_lemmatize(s)
s

['ironhack', 's', 'q', 'websit', 'is']

In [119]:
from nltk.corpus import stopwords

In [120]:
def remove_stopwords(list_of_words):
    """
    Drop English stop words from a list of strings.

    Args:
        list_of_words: A list of strings.

    Returns:
        A list with the words that are not English stop words, in their
        original order.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in list_of_words if word not in english_stopwords]

In [121]:
remove_stopwords(s)

['ironhack', 'q', 'websit']

In [122]:
def main_clean(s):
    """
    Run the whole text-preparation pipeline on one raw string.

    The string is cleaned, tokenized, stemmed and lemmatized, and finally
    stripped of English stop words.

    Args:
        s: The raw string to process.

    Returns:
        A list of processed tokens.
    """
    tokens = tokenize(clean_up(s))
    return remove_stopwords(stem_and_lemmatize(tokens))
    
    

In [123]:
s = """@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")"""

In [124]:
main_clean(s)

['ironhack', 'q', 'websit']

# Challenge 3

In [125]:
import zipfile

# Context managers close both the archive and the member stream as soon as
# the frame has been read, instead of leaving the zip handle open.
# NOTE(review): nrows keeps only the first 20000 rows of the file, and the
# rows shown by head() all have target 0 - confirm that both sentiment
# classes are present, otherwise draw a random sample instead.
with zipfile.ZipFile('Sentiment140.csv.zip') as zf, zf.open('Sentiment140.csv') as csv_file:
    sent = pd.read_csv(csv_file, nrows=20000)

In [126]:
sent.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [127]:
sent.shape

(20000, 6)

In [128]:
sent['text_processed'] = sent.text.apply(main_clean)

In [129]:
sent.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, awww, bummer, shoulda, got, david..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, updat, hi, facebook, text, might, cri,..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball, manag, save..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behav, mad, whi, becaus, see]"


In [250]:
sent.text_processed[0]

['switchfoot',
 'awww',
 'bummer',
 'shoulda',
 'got',
 'david',
 'carr',
 'third',
 'day']

In [141]:
# One token list per row. Series.tolist() already returns those lists, so
# the identity `.apply(lambda x: x)` pass over the column was unnecessary.
lista = sent.text_processed.tolist()

In [143]:
lista[0]

['switchfoot',
 'awww',
 'bummer',
 'shoulda',
 'got',
 'david',
 'carr',
 'third',
 'day']

In [149]:
# Flatten the per-row token lists into one list of tokens. Building it from
# the DataFrame column (rather than from `lista` itself) keeps this cell
# re-runnable, and plain Python flattening avoids converting ragged lists of
# strings to NumPy arrays and back.
lista = [token for tokens in sent.text_processed for token in tokens]

In [144]:
from sklearn.feature_extraction.text import CountVectorizer

In [150]:
# Fit a default CountVectorizer on `lista` to learn its vocabulary.
# NOTE(review): at this point every element of `lista` is a single token, so
# each "document" seen by the vectorizer is one word - confirm this is intended.
vect = CountVectorizer()
vect.fit(lista)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [159]:
vect.vocabulary_['switchfoot']  # index assigned to this word in the fitted vocabulary

17651

In [160]:
bag_of_words = vect.transform(lista)

In [181]:
# Count how often each token occurs. `lista` already holds one token per
# element, so the tokens are counted directly instead of running every
# single word through nltk.word_tokenize again.
wordfreq = {}
for token in lista:
    wordfreq[token] = wordfreq.get(token, 0) + 1


In [186]:
# Token -> count mapping ordered from most to least frequent
# (dicts keep insertion order).
ordenado = dict(sorted(wordfreq.items(), key=lambda pair: pair[1], reverse=True))

In [278]:
words = list(ordenado)[:5000]

In [279]:
# Keep the 5000 most frequent tokens with their counts. `ordenado` is
# already sorted by descending frequency, so a slice of its items replaces
# the manual counter loop.
top5000 = dict(list(ordenado.items())[:5000])

    

In [280]:
len(top5000)

5000

In [281]:
# `top5` is not defined by any cell in this notebook, so the original line
# only worked through leftover kernel state.
# NOTE(review): presumably the 5000-entry `words` list was meant - confirm.
len(words)

5000

In [282]:
def find_features(document, vocabulary=None):
    """
    Build a bag-of-words presence map for one document.

    Args:
        document: An iterable of tokens (for example one entry of the
            `text_processed` column).
        vocabulary: Iterable of words to use as feature names. When omitted,
            the notebook-level `words` list is used.
            NOTE(review): the original body read a global `word_features`
            that no cell defines; `words` (the most frequent tokens) is
            presumably what was meant - confirm.

    Returns:
        A dict mapping every vocabulary word to True when it occurs in
        `document` and False otherwise, in vocabulary order.
    """
    if vocabulary is None:
        vocabulary = words
    # Set membership keeps each lookup constant-time.
    present = set(document)
    return {word: (word in present) for word in vocabulary}

In [283]:
featuresets = sent.text_processed.apply(find_features)

In [297]:
fina = featuresets.values.tolist()

In [302]:
# Pair every feature dict with the sentiment label of its own row. The
# previous version labelled every sample True, so the classifier only ever
# saw one class and its 100% accuracy carried no information.
# NOTE(review): assumes the Sentiment140 encoding where target 4 means
# positive and 0 means negative - confirm against the dataset description.
featuresets = [
    (features, bool(target == 4))
    for features, target in zip(fina, sent.target)
]

In [303]:
# set that we'll train our classifier with
training_set = featuresets[:19000]

# set that we'll test against.
testing_set = featuresets[19000:]

In [304]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [305]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 100.0


In [306]:
classifier.show_most_informative_features(15)

Most Informative Features
