<a href="https://colab.research.google.com/github/iPrinka/MITx-Micromasters-Statistics-Data-Science/blob/main/oneleague_ml_w13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### NLP II

**OBJECTIVES**

- Use `spacy` for basic data exploration and matching
- Use `pytorch` for basic language models
- Use `sklearn` for text classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

[Twitter Dataset](https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/twitter_training.csv', header = None)
df.columns = ['id', 'label', 'sentiment', 'tweet']

In [4]:
df.head()

Unnamed: 0,id,label,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


### Intro to `spacy`

In [5]:
import spacy
from nltk.corpus import PlaintextCorpusReader

In [7]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [8]:
nlp = spacy.load('en_core_web_lg')

In [9]:
sentence = "Fufi is a very good girl, she needs a lot of attention."

In [10]:
doc = nlp(sentence)

In [12]:
for d in doc:
  print(d, d.pos_, d.lemma_)

Fufi PROPN Fufi
is AUX be
a DET a
very ADV very
good ADJ good
girl NOUN girl
, PUNCT ,
she PRON she
needs VERB need
a DET a
lot NOUN lot
of ADP of
attention NOUN attention
. PUNCT .


In [13]:
list(doc.noun_chunks)

[Fufi, a very good girl, she, a lot, attention]

### Matching with Spacy

In [14]:
from spacy.matcher import Matcher

In [16]:
# instantiate with vocabulary
matcher = Matcher(nlp.vocab)

In [17]:
pattern = [{"POS": "NOUN"}]   

In [19]:
matcher.add("Noun Pattern", [pattern])

In [20]:
matcher(doc)

[(2308383331340554964, 5, 6),
 (2308383331340554964, 10, 11),
 (2308383331340554964, 12, 13)]

In [21]:
doc

Fufi is a very good girl, she needs a lot of attention.

### Named Entities

In [22]:
sent = "Geoffrey Hinton worked at Google in California."

In [23]:
doc2 = nlp(sent)

In [24]:
from spacy import displacy

In [25]:
displacy.render(doc2, style='ent', jupyter=True)

In [26]:
doc2.ents

(Geoffrey Hinton, Google, California)

In [27]:
pattern2 = [{"ENT_TYPE": "PERSON"}]
matcher.add('Peeps', [pattern2])
matcher(doc2)

[(4890493497497021583, 0, 1), (4890493497497021583, 1, 2)]

### Dependencies

In [28]:
displacy.render(doc2, jupyter=True)

### Embeddings

In [29]:
doc.vector.shape

(300,)

In [31]:
apple1 = nlp("Apple shares rose on the news")
apple2 = nlp("Apple sold fewer iPhones this quarter")
apple3 = nlp("Apple pie is delicious")

In [32]:
apple1.similarity(apple2)

0.5555513711901393

In [33]:
apple1.similarity(apple3)

0.24332774320089606

In [34]:
apple2.similarity(apple3)

0.3441838714565569

### Activity

In [35]:
from sklearn.base import TransformerMixin, BaseEstimator

In [36]:
class WordVectorTransformer(TransformerMixin, BaseEstimator):

  def __init__(self, model='en_core_web_lg'):
    self.model = model
  
  def fit(self, X, y = None):
    return self
  
  def transform(self, X):
    nlp = spacy.load(self.model)
    return np.concatenate([nlp(doc).vector.reshape(1, -1) for doc in X])

In [45]:
X = df['tweet'][:5]

In [39]:
wvect = WordVectorTransformer()

In [46]:
X.shape

(5,)

In [47]:
sample_vects = wvect.transform(X)

In [48]:
sample_vects.shape

(5, 300)

In [49]:
y = df["sentiment"]

In [None]:
df.dropna(inplace=True)

In [50]:
X = df['tweet']
y = df['sentiment']

### Building Models

- Use `CountVectorizer`
- Use `TfidfVectorizer`
- Use custom word vectors vectorization

In [52]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 74682 entries, 0 to 74681
Series name: sentiment
Non-Null Count  Dtype 
--------------  ----- 
74682 non-null  object
dtypes: object(1)
memory usage: 583.6+ KB


In [53]:
from sklearn.pipeline import Pipeline

In [57]:
pipe1 = Pipeline([('word_vectorizer', CountVectorizer()), 'classifier', LogisticRegression()])

In [58]:
pipe2 = Pipeline([('word_vectorizer', WordVectorTransformer()), 'classifier', LogisticRegression()])

In [None]:
pipe1.fit(X,y)

### Getting News Data

[News API](https://newsapi.org/)