# AIAC536 Assignment 1 - Understanding Python with NLP
This assignment is taken from Chapter 2 of the book *Hands-On Python Natural Language Processing* by Aman Kedia and Mayank Rasu.

In [67]:
import nltk

# Fetch the NLTK data packages for the perceptron POS tagger and for the
# tag-set help text; both are used by the POS tagging cells further down.
# NOTE(review): later cells also use word_tokenize, the stopwords corpus and
# WordNetLemmatizer. Presumably their data (e.g. punkt, stopwords, wordnet)
# was installed beforehand - confirm, otherwise a fresh environment will
# fail with a LookupError in those cells.
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kristian.aars/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/kristian.aars/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [68]:
import pandas as pd

# Load the flight records from the CSV file next to the notebook.
data = pd.read_csv("flight_data.csv")

# Average departure delay per origin airport, then pick the airport whose
# average is the largest (idxmax returns the index label, i.e. the ORIGIN code).
mean_delay_by_origin = data.groupby("ORIGIN")["DEP_DELAY"].mean()
mean_delay_by_origin.idxmax()

'PPG'

In [69]:
from sklearn.feature_extraction.text import CountVectorizer

# A one-document corpus: CountVectorizer expects an iterable of documents.
sentence = ["How to change payment method and payment frequency"]

# Bag-of-words counts with the built-in English stop-word list removed.
vectorizer = CountVectorizer(stop_words='english')
term_counts = vectorizer.fit_transform(sentence)

# The result is a sparse matrix; convert it to a dense one so the counts are
# readable. Column order follows the vocabulary learned by the vectorizer.
term_counts.todense()


matrix([[1, 1, 1, 2]])

## NLTK

In [70]:
from nltk.tokenize import word_tokenize

# Example sentence used throughout the NLTK section.
text = "Who would have thought that computer programs would be analyzing human sentiments"
# Split the sentence into word-level tokens; `tokens` is reused by the
# stop-word filtering cell below.
tokens = word_tokenize(text)
# Bare expression so the notebook displays the token list.
tokens

['Who',
 'would',
 'have',
 'thought',
 'that',
 'computer',
 'programs',
 'would',
 'be',
 'analyzing',
 'human',
 'sentiments']

In [71]:
# English stop-word list from the NLTK corpus. `sw` is kept as a list so any
# other cell that uses it keeps working.
sw = nltk.corpus.stopwords.words('english')

# Build a set once for the membership test: looking a token up in a set is
# O(1), whereas `word not in sw` scans the whole list for every token.
# The filtered result is exactly the same.
sw_lookup = set(sw)

# Keep only the tokens that are not stop words.
# NOTE: the comparison is case-sensitive, so tokens are matched exactly as
# written (a capitalised token is only dropped if the list contains it in
# that same capitalisation).
[word for word in tokens if word not in sw_lookup]


['Who',
 'would',
 'thought',
 'computer',
 'programs',
 'would',
 'analyzing',
 'human',
 'sentiments']

### Stemming and Lemmatization

In [72]:
from nltk.stem import WordNetLemmatizer

# Lemmatization: reduce each token to its dictionary form using WordNet.
text = "Who would have thought that computer programs would be analyzing human sentiments"
lemmatizer = WordNetLemmatizer()

# Tokenize the sentence and lemmatize every token with the lemmatizer's
# default part of speech, collecting the results back into `tokens`.
tokens = list(map(lemmatizer.lemmatize, word_tokenize(text)))
print(tokens)

['Who', 'would', 'have', 'thought', 'that', 'computer', 'program', 'would', 'be', 'analyzing', 'human', 'sentiment']


In [73]:
from nltk.stem import PorterStemmer

# Stemming: strip suffixes with the Porter algorithm. Unlike lemmatization,
# the result need not be a real word (see 'comput', 'analyz' in the output).
text = "Who would have thought that computer programs would be analyzing human sentiments"
ps = PorterStemmer()

# The sentence is lower-cased before it is tokenized.
lowered = text.lower()
tokens = word_tokenize(lowered)

stem = ps.stem
[stem(token) for token in tokens]

['who',
 'would',
 'have',
 'thought',
 'that',
 'comput',
 'program',
 'would',
 'be',
 'analyz',
 'human',
 'sentiment']

## POS Tagging (Part of Speech tagging)

In [74]:
# Part-of-speech tagging with NLTK's default tagger.
text = "Usain Bolt is the fastest runner in the world"
tokens = word_tokenize(text)

# Each token is wrapped in its own one-element list, so the tagger is called
# once per word and the result is a list of single-item lists.
# NOTE(review): tagging words one at a time means the tagger never sees the
# surrounding sentence; `nltk.pos_tag(tokens)` would tag the sentence as a
# whole and may give different tags (e.g. for the proper nouns) - confirm
# whether per-word tagging is intended here.
[nltk.pos_tag([word]) for word in tokens]

[[('Usain', 'NN')],
 [('Bolt', 'NN')],
 [('is', 'VBZ')],
 [('the', 'DT')],
 [('fastest', 'JJS')],
 [('runner', 'NN')],
 [('in', 'IN')],
 [('the', 'DT')],
 [('world', 'NN')]]

In [75]:
# Print the Penn Treebank tag set with a description and examples for each
# tag, as a legend for the POS tags shown above.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

# TextBlob

In [76]:
from textblob import TextBlob

# Print TextBlob's sentiment (polarity and subjectivity) for a few sample
# phrases, one result per line.
sample_phrases = (
    "I love pizza",
    "The weather is excellent",
    "What a terrible thing to say",
)
for phrase in sample_phrases:
    print(TextBlob(phrase).sentiment)

Sentiment(polarity=0.5, subjectivity=0.6)
Sentiment(polarity=1.0, subjectivity=1.0)
Sentiment(polarity=-1.0, subjectivity=1.0)


In [77]:
## Part-of-speech tagging with TextBlob
# `.tags` yields (word, tag) pairs for the whole sentence.
blob = TextBlob("The global economy is expected to grow this year")
blob.tags

[('The', 'DT'),
 ('global', 'JJ'),
 ('economy', 'NN'),
 ('is', 'VBZ'),
 ('expected', 'VBN'),
 ('to', 'TO'),
 ('grow', 'VB'),
 ('this', 'DT'),
 ('year', 'NN')]

# VADER

In [78]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score a couple of sample sentences with VADER. polarity_scores returns a
# dict with 'neg', 'neu', 'pos' and 'compound' entries.
analyser = SentimentIntensityAnalyzer()

sample_reviews = ("This book is very good", "OMG! This book is so cool!")
for review in sample_reviews:
    print(analyser.polarity_scores(review))


{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.4927}
{'neg': 0.0, 'neu': 0.583, 'pos': 0.417, 'compound': 0.5537}


# Web Scraping

In [79]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the title, price and star rating of every laptop listed on the
# webscraper.io practice shop and save the result as a CSV file.
titles = []
prices = []
ratings = []

url = 'https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
# (`request` holds the HTTP response; the name is kept for compatibility.)
request = requests.get(url, timeout=30)
request.raise_for_status()

soup = BeautifulSoup(request.text, "html.parser")

# Every product card is one grid column. All three fields are read from the
# same card in a single pass, so the three lists always stay aligned row by
# row (the earlier nested loops appended titles and ratings from inside the
# price loop, which could duplicate or misalign entries).
for product in soup.find_all('div', {'class': 'col-sm-4 col-lg-4 col-md-4'}):
    caption = product.find('div', {'class': 'caption'})
    if caption is None:
        continue

    price = caption.find('h4', {'class': 'pull-right price'})
    # Explicit class filter; a bare {'title'} set is only treated as a class
    # search implicitly.
    link = caption.find('a', {'class': 'title'})
    stars = product.find('div', {'class': 'ratings'})
    if price is None or link is None or stars is None:
        # Skip incomplete cards rather than let one missing field shift the
        # columns of every following row.
        continue

    titles.append(link.get('title'))
    prices.append(price.text)
    # The rating is the number of star icons inside the ratings block.
    ratings.append(len(stars.find_all('span', {'class': 'glyphicon glyphicon-star'})))

## Save as CSV
product_df = pd.DataFrame({"Title": titles, "Price": prices, "Rating": ratings})
product_df.to_csv("ecommerce.csv", index=False)