In [4]:
import pandas as pd

In [5]:
import nltk
import numpy as np
import re
import string

import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [1]:
import multiprocessing
from pandarallel import pandarallel
import spacy

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [3]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
from google.cloud import storage
storage_client = storage.Client()

bucket_name = "nlp_finalproject"

bucket = storage_client.bucket(bucket_name)
print(f"Bucket {bucket.name} connected.")

Bucket nlp_finalproject connected.


In [7]:
df_yelp_sent = pd.read_parquet("gs://nlp_finalproject/SentimentAnalysis_Clean.parquet", engine='pyarrow')
df_yelp_sent.head(5)

Unnamed: 0,date,title,text,year,cleaned,clean_title,ORG_Ent,sentiment,sentiment_score,Person_Ent
0,2020-02-26,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,2020,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...,[LegalTech Artificial Intelligence Market Tec...,Positive,0.9991,"[Aristocrat Leisure, Thermo Fisher, Woodley Eq..."
4,2020-08-07,Two dead as AI Express flight skids off Kozhik...,Two dead as AI Express flight skids off Kozhik...,2020,Two dead as AI Express flight skids off Kozhik...,Two dead as AI Express flight skids off Kozhik...,"[AI Express, PostBeyond BygoneEpic PowerIn Ret...",Negative,-0.9728,"[Kozhikode airportTop Toggle, Safari, safari,..."
7,2021-02-25,MulticoreWare Inc. Becomes CEVA’s Trusted Part...,\n\nMulticoreWare Inc. Becomes CEVA’s Trusted ...,2021,MulticoreWare Inc. Becomes CEVA s Trusted Part...,MulticoreWare Inc. Becomes CEVA s Trusted Part...,"[MulticoreWare Inc., Trusted Partner for Imagi...",Positive,0.9981,"[Erez Natan, Soumendra Mohanty]"
8,2022-10-06,Healthcare Artificial Intelligence Market Anal...,\n\nHealthcare Artificial Intelligence Market ...,2022,Healthcare Artificial Intelligence Market Anal...,Healthcare Artificial Intelligence Market Anal...,[Healthcare Artificial Intelligence Market Ana...,Positive,0.9993,"[AiCure, Web , Profile Follow]"
9,2020-04-29,Artificial Intelligence As A Service Market : ...,\n\nArtificial Intelligence As A Service Marke...,2020,Artificial Intelligence As A Service Market : ...,Artificial Intelligence As A Service Market : ...,"[Artificial Intelligence As A Service Market, ...",Positive,0.9996,"[Emerging Trends, IRIS AI, Get Sample PDF, Pur..."


In [8]:
num_processors = multiprocessing.cpu_count()
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False, progress_bar = True)

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [9]:
directory = 'https://storage.googleapis.com/msca-bdp-data-open/yelp/'
fileName = 'yelp_train_sentiment.json'

path = directory + fileName

yelp = pd.read_json(path, orient='records', lines=True)

In [10]:
X = yelp['text']
y = yelp['label']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

pipe_svm = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    SGDClassifier(max_iter=100, tol=None,loss='log_loss') 
)

pipe_svm.fit(X_train, y_train)

y_pred = pipe_svm.predict(X_test)

probabilities = pipe_svm.predict_proba(X_test)

In [11]:
def sentiment_score(text):
    prob = pipe_svm.predict_proba([text]) 
    return prob[0]

df_yelp_sent['yelp_sent_score'] = df_yelp_sent['cleaned'].parallel_apply(sentiment_score)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10889), Label(value='0 / 10889')))…

In [12]:
def sentiment(probabilities):
    if probabilities[0] >= 0.85:
        return 'Negative'
    elif probabilities[1] >= 0.85:
        return 'Positive'
    else:
        return 'Neutral'
        
df_yelp_sent['sentiment_yelp'] = df_yelp_sent['yelp_sent_score'].parallel_apply(sentiment)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10889), Label(value='0 / 10889')))…

In [13]:
df_yelp_sent['sentiment_yelp'].value_counts()

sentiment_yelp
Negative    94440
Positive    40630
Neutral     28251
Name: count, dtype: int64

In [14]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.6.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [15]:
nlp = spacy.load("en_core_web_sm")

In [16]:
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    return entities

df_yelp_sent['GPE_Ent'] = df_yelp_sent['cleaned'].parallel_apply(extract_entities)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10889), Label(value='0 / 10889')))…

In [18]:
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]
    return entities

df_yelp_sent['PROD_Ent'] = df_yelp_sent['cleaned'].parallel_apply(extract_entities)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10889), Label(value='0 / 10889')))…

In [20]:
df_yelp_sent.to_parquet('gs://nlp_finalproject/FinalData.parquet')

### Result for Yelp Sentiment Analysis Using SVM

- Negative    94440
- Positive    40630
- Neutral     28251

### Result for Yelp Sentiment Analysis Using Logistic Regression

- Negative    103568
- Positive     45077
- Neutral      14676