# Challenge 1: Natural Language Processing Overview

In [1]:
import nltk

In [2]:
# Download the Brown corpus into the local nltk_data directory.
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/isi/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
from nltk.corpus import brown

In [4]:
# Preview the first ten word tokens of the Brown corpus.
brown.words()[0:10]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [5]:
# Preview the first ten (word, tag) pairs of the tagged Brown corpus.
brown.tagged_words()[0:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [6]:
# Sample paragraph used by the sentence- and word-tokenization cells below.
text = 'Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'



In [7]:
from nltk import sent_tokenize, word_tokenize

In [8]:
# Split the sample paragraph into a list of sentences.
sent_tokenize(text)

['Ironhack is a Global Tech School ranked num 2 worldwide.',
 'Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.',
 'This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course.',
 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']

In [9]:
# Split the sample paragraph into word and punctuation tokens.
# print() is used so the long token list is shown on a single line.
print(word_tokenize(text))

['Ironhack', 'is', 'a', 'Global', 'Tech', 'School', 'ranked', 'num', '2', 'worldwide', '.', 'Our', 'mission', 'is', 'to', 'help', 'people', 'transform', 'their', 'careers', 'and', 'join', 'a', 'thriving', 'community', 'of', 'tech', 'professionals', 'that', 'love', 'what', 'they', 'do', '.', 'This', 'ideology', 'is', 'reflected', 'in', 'our', 'teaching', 'practices', ',', 'which', 'consist', 'of', 'a', 'nine-weeks', 'immersive', 'programming', ',', 'UX/UI', 'design', 'or', 'Data', 'Analytics', 'course', 'as', 'well', 'as', 'a', 'one-week', 'hiring', 'fair', 'aimed', 'at', 'helping', 'our', 'students', 'change', 'their', 'career', 'and', 'get', 'a', 'job', 'straight', 'after', 'the', 'course', '.', 'We', 'are', 'present', 'in', '8', 'countries', 'and', 'have', 'campuses', 'in', '9', 'locations', '-', 'Madrid', ',', 'Barcelona', ',', 'Miami', ',', 'Paris', ',', 'Mexico', 'City', ',', 'Berlin', ',', 'Amsterdam', ',', 'Sao', 'Paulo', 'and', 'Lisbon', '.']


# Challenge 2: Preparing Text Data For Analysis

In [10]:
# Noisy sample (mention, hashtag, digits, URL, brackets) used to exercise clean_up().
s = """@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")"""

In [11]:
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Any ``http://``, ``https://`` or ``www.`` URL is removed (not only one
    specific address). Runs of digits and runs of non-word characters are
    then replaced by a single space, and the result is lower-cased and
    stripped of leading/trailing whitespace. Underscores count as word
    characters and are therefore kept.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    import re
    # Remove whole URLs first, while the "://" and dots that identify them
    # are still present in the text.
    s = re.sub(r'(?:https?://|\bwww\.)\S+', ' ', s, flags=re.IGNORECASE)
    s = re.sub(r'\d+', ' ', s)
    # \W also matches whitespace, so this collapses runs of spaces as well.
    s = re.sub(r'\W+', ' ', s)
    s = s.lower().strip()
    return s

In [12]:
# Replace the raw sample with its cleaned form; the tokenization cell below reads `s`.
s = clean_up(s)
s

'ironhack s q website is'

In [13]:
def tokenize(s):
    """
    Split a string into word-level tokens with NLTK.

    Args:
        s: The text to split.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize`` for ``s``.
    """
    from nltk import word_tokenize as split_into_words

    tokens = split_into_words(s)
    return tokens

In [14]:
# Tokenize the cleaned string; the resulting list `l` feeds the stemming cell below.
l = tokenize(s)
l

['ironhack', 's', 'q', 'website', 'is']

In [15]:
# NOTE(review): `stem` is not referenced by name in the visible cells;
# stem_and_lemmatize() imports its classes from nltk.stem directly.
from nltk import stem
# Download the WordNet data (presumably required by the WordNetLemmatizer used below).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/isi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
def stem_and_lemmatize(l):
    """
    Reduce each word to its Porter stem, then lemmatize that stem.

    Args:
        l: A list of strings.

    Returns:
        A new list holding, for every input word and in the same order,
        the WordNet lemma of its Porter stem.
    """
    from nltk.stem import PorterStemmer, WordNetLemmatizer

    stemmer = PorterStemmer()
    wordnet = WordNetLemmatizer()
    # Stemming is applied first, so the lemmatizer only ever sees stems.
    return [wordnet.lemmatize(stemmer.stem(token)) for token in l]

In [19]:
# Rebuild the token list from the cleaned string `s` instead of feeding `l`
# back into itself, so re-running this cell never stems already-stemmed words.
l = stem_and_lemmatize(tokenize(s))
l

['ironhack', 's', 'q', 'websit', 'is']

# Challenge 3: Sentiment Analysis

In [22]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the VADER lexicon before the analyzer is constructed.
nltk.download('vader_lexicon')
# First two sentences of the Ironhack sample paragraph, scored as one text.
txt = "Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do."
analyzer = SentimentIntensityAnalyzer()
# Displays a dict with 'neg', 'neu', 'pos' and 'compound' scores for txt.
analyzer.polarity_scores(txt)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/isi/nltk_data...


{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.8442}

In [23]:
import zipfile
import pandas as pd
# Read the CSV member straight out of the archive. The context managers close
# both the archive and the member handle once the frame has been loaded.
with zipfile.ZipFile('./Sentiment140.csv.zip') as zf, zf.open('Sentiment140.csv') as csv_file:
    df = pd.read_csv(csv_file)