In [None]:
# Challenge 1 

In [1]:
import nltk

In [2]:
from nltk.corpus import brown
nltk.download('brown')

print(brown.words()[0:10])

[nltk_data] Downloading package brown to /Users/juan/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']


In [3]:
print(brown.tagged_words()[0:10])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN')]


In [4]:
text = '''Ironhack is a Global Tech School ranked num 2 worldwide.
        Our mission is to help people transform their careers and join a thriving 
        community of tech professionals that love what they do. This ideology is reflected 
        in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI 
        design or Data Analytics course as well as a one-week hiring fair aimed at helping our 
        students change their career and get a job straight after the course. 
        We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, 
        Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'''

In [5]:
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')

sent_tokenize(text)

[nltk_data] Downloading package punkt to /Users/juan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Ironhack is a Global Tech School ranked num 2 worldwide.',
 'Our mission is to help people transform their careers and join a thriving \n        community of tech professionals that love what they do.',
 'This ideology is reflected \n        in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI \n        design or Data Analytics course as well as a one-week hiring fair aimed at helping our \n        students change their career and get a job straight after the course.',
 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, \n        Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']

In [6]:
word_tokenize(text)[0:20]

['Ironhack',
 'is',
 'a',
 'Global',
 'Tech',
 'School',
 'ranked',
 'num',
 '2',
 'worldwide',
 '.',
 'Our',
 'mission',
 'is',
 'to',
 'help',
 'people',
 'transform',
 'their',
 'careers']

In [None]:
# Challenge 2

In [7]:
in_put = "@Ironhack's-#Q website 776-is http://ironhack.com [(2018)])"
out_put = 'ironhack s  q website  is'

In [8]:
import re

def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    
    s = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', s) # para limpiar URL
    s = re.sub('\d', ' ', s) # /d Any numeric character
    s = re.sub('\W', ' ', s) # Any non-alphanumeric character
    return s.lower().strip()

print(clean_up(in_put))

ironhack s  q website     is


In [None]:
# Tokenization

In [9]:
def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """

    return word_tokenize(clean_up(s))

print(tokenize(in_put))

['ironhack', 's', 'q', 'website', 'is']


In [10]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('was')

lemmatizer.lemmatize('runs', pos='v')

[nltk_data] Downloading package wordnet to /Users/juan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


'run'

In [11]:
from nltk.stem import SnowballStemmer

def stem_and_lemmatize(s):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    return [WordNetLemmatizer().lemmatize(SnowballStemmer('english').stem(x)) for x in tokenize(s)]
    
    
print(stem_and_lemmatize(in_put)) # nos ha quitado una 'e' en websit

['ironhack', 's', 'q', 'websit', 'is']


In [None]:
# Stop Words Removal

In [12]:
from nltk.corpus import stopwords 
nltk.download('stopwords')


def remove_stopwords(s):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after stop words are removed.
       """
    
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in stem_and_lemmatize(s) if not w in stop_words]  
    
    return filtered_sentence
        
print(remove_stopwords(in_put))

[nltk_data] Downloading package stopwords to /Users/juan/nltk_data...


['ironhack', 'q', 'websit']


[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Challenge 3

In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

txt = "Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do."
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(txt)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/juan/nltk_data...


{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.8442}

In [14]:
import pandas as pd
import zipfile

zf = zipfile.ZipFile('Sentiment140.csv.zip')
sen = pd.read_csv(zf.open('Sentiment140.csv'))
sen.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [15]:
short = sen[:5000] # Lo acortamos para que no tarde muchísimo en ejecutarse porque son 1.6 millones de rows

In [16]:
short['text_processed'] = short['text'].apply(remove_stopwords)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
#def clean_f(x): 
 #   functions = [clean_up, tokenize, stem_and_lemmatize, remove_stopwords]
  #  for f in functions: 
   # x = f(x)
    #return x
#short['text_processed']=short.text.apply(clean_f)

In [18]:
short.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, zl, awww, bummer, shoulda, got, d..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, updat, facebook, text, might, cri, res..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball, manag, save..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behav, mad, whi, becaus, see]"


In [None]:
# Creating Bag of Words

In [19]:
words = []
for x in short.text_processed:
    words += x

In [20]:
words[0:10]

['switchfoot',
 'zl',
 'awww',
 'bummer',
 'shoulda',
 'got',
 'david',
 'carr',
 'third',
 'day']

In [21]:
from nltk.probability import FreqDist

fdist = FreqDist(words)
# sorted(fdist, key=fdist.get, reverse=True)[:5000]

voc = fdist.most_common(5000)
bag_of_words = [x[0] for x in voc]
bag_of_words[0:10]

['go', 'work', 'get', 'wa', 'day', 'like', 'today', 'miss', 'sleep', 'feel']