# Challenge 1

In [1]:
import nltk

#nltk.download()

In [2]:
from nltk.corpus import brown

brown.words()[0:10]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [3]:
brown.tagged_words()[0:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [4]:
text = 'Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'



In [5]:
from nltk import sent_tokenize, word_tokenize

sent_tokenize(text)

['Ironhack is a Global Tech School ranked num 2 worldwide.',
 'Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.',
 'This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course.',
 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']

In [6]:
word_tokenize(text)

['Ironhack',
 'is',
 'a',
 'Global',
 'Tech',
 'School',
 'ranked',
 'num',
 '2',
 'worldwide',
 '.',
 'Our',
 'mission',
 'is',
 'to',
 'help',
 'people',
 'transform',
 'their',
 'careers',
 'and',
 'join',
 'a',
 'thriving',
 'community',
 'of',
 'tech',
 'professionals',
 'that',
 'love',
 'what',
 'they',
 'do',
 '.',
 'This',
 'ideology',
 'is',
 'reflected',
 'in',
 'our',
 'teaching',
 'practices',
 ',',
 'which',
 'consist',
 'of',
 'a',
 'nine-weeks',
 'immersive',
 'programming',
 ',',
 'UX/UI',
 'design',
 'or',
 'Data',
 'Analytics',
 'course',
 'as',
 'well',
 'as',
 'a',
 'one-week',
 'hiring',
 'fair',
 'aimed',
 'at',
 'helping',
 'our',
 'students',
 'change',
 'their',
 'career',
 'and',
 'get',
 'a',
 'job',
 'straight',
 'after',
 'the',
 'course',
 '.',
 'We',
 'are',
 'present',
 'in',
 '8',
 'countries',
 'and',
 'have',
 'campuses',
 'in',
 '9',
 'locations',
 '-',
 'Madrid',
 ',',
 'Barcelona',
 ',',
 'Miami',
 ',',
 'Paris',
 ',',
 'Mexico',
 'City',
 ',',
 '

# Challenge 2

In [7]:
import re

test_string = '''@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")'''
expected_output = 'ironhack s  q website  is'

def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    res = re.sub('[0-9]+', '', s).lower()
    res = re.sub('http://\w+\.\w+', '', res)
    
    for letter in res:
        if not letter.isalpha() and letter != ' ':
            res = res.replace(letter,' ')
            
    #res = re.sub('http', '', res)
    return res.strip()

In [8]:
clean_up(test_string) == expected_output

True

In [9]:
from nltk.tokenize import word_tokenize

expected_list = ['ironhack', 's', 'q', 'website', 'is']

def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    return word_tokenize(s)

In [10]:
string = clean_up(test_string)
tokenized = tokenize(string)
tokenized

['ironhack', 's', 'q', 'website', 'is']

In [11]:
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
print(" ".join(SnowballStemmer.languages))

def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    
    res = []
    for word in l:
        stem_word = stemmer.stem(word) 
        res.append(lemmatizer.lemmatize(stem_word))
        
    return res

arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [12]:
sl_ized = stem_and_lemmatize(tokenized)
sl_ized

['ironhack', 's', 'q', 'websit', 'is']

In [13]:
from nltk.corpus import stopwords

def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after stop words are removed.
    """
    stopWords = set(stopwords.words('english'))
    return [w for w in l if w not in stopWords]

In [14]:
remove_stopwords(sl_ized)

['ironhack', 'q', 'websit']

# Challenge 3

In [15]:
import pandas as pd
import zipfile
#import dask.dataframe as dd
import multiprocessing as mp

zf = zipfile.ZipFile('Sentiment140.csv.zip')
df = pd.read_csv(zf.open('Sentiment140.csv'))

In [16]:
def prepareText(text):
    cleaned = clean_up(text)
    tokenized = tokenize(cleaned)
    sl_ized = stem_and_lemmatize(tokenized)
    return remove_stopwords(sl_ized)

In [17]:
pool = mp.Pool(processes=(mp.cpu_count()-1))

df['text_processed'] = pool.map(prepareText, df.text)
df.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, yzl, awww, bummer, shoulda, got, ..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, updat, facebook, text, might, cri, res..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball, manag, save..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behav, mad, whi, becaus, see]"


Process ForkPoolWorker-4:
Process ForkPoolWorker-7:
Process ForkPoolWorker-5:
Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Process ForkPoolWorker-6:
Process ForkPoolWorker-3:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/linuxbrew/.linuxbrew/opt/python/lib/python3.7/m

In [26]:
import numpy as np

df.drop(df[df['text_processed'].astype(str) == '[]'].index, inplace=True)

#biglist = pool.map(np.concatenate, list(df['text_processed'].values))
#biglist = df['text_processed'].sum()
biglist = np.concatenate(list(df['text_processed'].values))
biglist

array(['switchfoot', 'yzl', 'awww', ..., 'thenspcc', 'sparkschar',
       'speakinguphh'], dtype='<U116')

In [27]:
len(biglist)

12426737

In [29]:
from nltk.probability import FreqDist

bow = FreqDist()
for word in biglist:
    bow[word] += 1
    
bow

FreqDist({'go': 138721, 'get': 110850, 'day': 109277, 'wa': 105045, 'good': 92571, 'work': 87875, 'like': 83847, 'love': 83064, 'quot': 73415, 'got': 71110, ...})

In [47]:
bow_df = pd.DataFrame(bow.items())
bow_df.rename(columns = {0:'word', 1:'count'}, inplace=True)
bow_df.head()

Unnamed: 0,0,1
0,switchfoot,28
1,yzl,3
2,awww,5655
3,bummer,1573
4,shoulda,358


In [63]:
bow_df.sort_values(by=['count'], ascending=False)
main_bow = bow_df.sort_values(by=['count'], ascending=False).iloc[:5000]
main_bow

Unnamed: 0,word,count
29,go,138721
125,get,110850
9,day,109277
76,wa,105045
208,good,92571
...,...,...
12430,lauri,173
5070,moi,173
5893,clark,173
3281,toll,173


In [77]:
main_bow.word.values

True

In [None]:
features = []
for l in df.text_processed.values:
    tweet_features = {}
    for word in main_bow.word.values:
        tweet_features[word] = word in l
    features.append((tweet_features))
    
features