# Natural Language Processing

In [2]:
import pandas as pd
import numpy as np
import nltk
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

In [3]:
# Toy sentence that the demo cells below clean, tokenize, lemmatize, stem and filter.
kittycat = 'We are all agreeing with the cats on this one, and she is too!'

### Basic Cleanup

In [None]:
import re
def clean_up(text):
    """Normalise raw text before tokenization.

    Parameters
    ----------
    text : str
        The raw text, e.g. one review.

    Returns
    -------
    str
        Lower-cased text containing only ASCII letters, digits and spaces,
        with leading/trailing whitespace removed.
    """
    # HTML line breaks would otherwise be reduced to the letters "br" and glued
    # onto the neighbouring word ("great.<br />The" -> "greatbr the").
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    # Tabs and newlines become spaces so that dropping them below cannot
    # merge two adjacent words into one.
    text = re.sub(r'\s', ' ', text)
    # Drop everything that is not an ASCII letter, a digit or a space.
    text = re.sub('[^A-Za-z0-9 ]', '', text)
    return text.lower().strip()
# Run the cleanup on the toy sentence and print the normalised string.
kittycat_clean = clean_up(kittycat)
print(kittycat_clean)

### Tokenization

In [None]:
from nltk.tokenize import word_tokenize
# Split the cleaned sentence into a list of tokens and print it.
kittycat_tokenize = word_tokenize(kittycat_clean)
print(kittycat_tokenize)

### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by the next cell.
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize each token of the toy sentence, keeping the token order.
kittycat_lemmatize = list(map(lemmatizer.lemmatize, kittycat_tokenize))
print(kittycat_lemmatize)

### Stemming

In [None]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for English, used by the next cell.
stemmer = SnowballStemmer("english")

In [None]:
# Stem each lemmatized token of the toy sentence, keeping the token order.
kittycat_stem = list(map(stemmer.stem, kittycat_lemmatize))
print(kittycat_stem)

### Removing Stopwords

In [None]:
from nltk.corpus import stopwords

# English stop-word list shipped with NLTK.
stopwords_list = stopwords.words("english")
# Keep only the stemmed tokens that do not appear in the stop-word list.
kittycat_nostopwords = list(
    filter(lambda token: token not in stopwords_list, kittycat_stem)
)
print(kittycat_nostopwords)

### Vectorizing Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings for the toy example.
vectorizer = CountVectorizer()
# Prints the full class documentation; the output of this cell is long.
help(CountVectorizer)

In [None]:
# NOTE(review): a list of tokens is passed here, so each token appears to be
# treated as its own document (one matrix row per token) - confirm this is intended.
vectorizer.fit_transform(kittycat_nostopwords).toarray()

## Applying to real data: IMDB movie reviews

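Download the Large Movie Review Dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it next to this notebook, so that the `aclImdb/train/pos` and `aclImdb/train/neg` folders read below exist.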

An example walkthrough: https://dropsofai.com/sentiment-analysis-with-python-bag-of-words/

In [None]:
from pathlib import Path

# reading positive reviews
txt_folder = Path('aclImdb/train/pos').rglob('*.txt')
# Sorted so the row order does not depend on how the filesystem lists files.
files = sorted(txt_folder)
content = []
for name in files:
    # The context manager closes the handle even if reading fails. The encoding
    # is explicit so the result does not depend on the platform default
    # (assumes the dataset files are UTF-8 - TODO confirm).
    with open(name, 'r', encoding='utf-8') as f:
        # Only the first line of each file is used; readline() yields '' for an
        # empty file instead of raising IndexError.
        content.append(f.readline())
pos = pd.DataFrame(content)

In [None]:
# reading negative reviews
txt_folder = Path('aclImdb/train/neg').rglob('*.txt')
# Sorted so the row order does not depend on how the filesystem lists files.
files = sorted(txt_folder)
content = []
for name in files:
    # The context manager closes the handle even if reading fails. The encoding
    # is explicit so the result does not depend on the platform default
    # (assumes the dataset files are UTF-8 - TODO confirm).
    with open(name, 'r', encoding='utf-8') as f:
        # Only the first line of each file is used; readline() yields '' for an
        # empty file instead of raising IndexError.
        content.append(f.readline())
neg = pd.DataFrame(content)

In [None]:
# Label column for the classifier: 1 marks a positive review, 0 a negative one.
pos = pos.assign(target=1)
neg = neg.assign(target=0)

In [None]:
# putting both dataframes together
# ignore_index gives every review a unique row label; without it the labels of
# pos and neg both start at 0 and collide, which makes label-based selection
# ambiguous. rename() returns a new frame, so no in-place mutation is needed.
df = (
    pd.concat([pos, neg], axis=0, ignore_index=True)
    .rename(columns={0: 'review'})
)

In [None]:
# the dataset is very large, so we are only taking a subset for analysis
# A fixed seed draws the same 25% subset on every run, so the scores reported
# further down are reproducible.
# NOTE: this cell overwrites df; re-running it on its own shrinks the data again.
df = df.sample(frac=0.25, random_state=42)

## Preparing the data

In [None]:
# Number of rows per target value in the sampled subset (class balance check).
df['target'].value_counts()

In [None]:
# Normalise every review with clean_up and store the result in its own column.
df['review_clean'] = df['review'].map(clean_up)
df.head()

## Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# One list of tokens per cleaned review.
df['review_tokenize'] = df['review_clean'].map(word_tokenize)
df.head()

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemma of every token of one review, in order."""
    return list(map(lemmatizer.lemmatize, tokens))


df['review_lemmatize'] = df['review_tokenize'].map(_lemmatize_tokens)
df.head()

## Stemming

In [None]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")


def _stem_tokens(tokens):
    """Return the stem of every token of one review, in order."""
    return list(map(stemmer.stem, tokens))


df['review_stem'] = df['review_lemmatize'].map(_stem_tokens)
df.head()

## Removing Stopwords

In [None]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words("english")

# The tokens in 'review_stem' went through clean_up, lemmatization and stemming,
# so raw stop words such as "because" or "very" no longer match them (they
# appear as "becaus" / "veri"). Push the stop words through the same steps and
# keep both the raw and the normalised forms; a set makes each lookup O(1).
stopwords_normalized = set(stopwords_list)
stopwords_normalized.update(
    stemmer.stem(lemmatizer.lemmatize(clean_up(word))) for word in stopwords_list
)

df['review_nostopwords'] = df['review_stem'].apply(
    lambda row: [item for item in row if item not in stopwords_normalized]
)
df.head()

## Vectorizing Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


def _passthrough_tokens(tokens):
    """Hand the input back unchanged; used as the vectorizer's analyzer."""
    return tokens


# The custom analyzer returns each input as-is instead of analysing raw text.
vectorizer = CountVectorizer(analyzer=_passthrough_tokens)

In [None]:
# Keep the document-term matrix sparse: almost every entry is zero, and a dense
# copy (rows x vocabulary size) can take gigabytes of memory. train_test_split
# and LogisticRegression both accept sparse input directly.
X = vectorizer.fit_transform(df['review_nostopwords'])

## Splitting into train and test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, df['target'], test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training split.
# NOTE(review): if a ConvergenceWarning shows up here, raising max_iter is the
# usual remedy - confirm against the actual run.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out test split.
pred = clf.predict(X_test)

In [None]:
# Score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Print precision, recall and F1 (in that order) for the test predictions.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, pred))

In [None]:
from sklearn.metrics import balanced_accuracy_score
# Balanced accuracy of the test predictions against the true labels.
balanced_accuracy_score(y_test, pred)