# **NB5.1 - Sentiment Analysis using a binary classifier**

## Part 0: Importing Libraries

In [21]:
import sklearn
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics

from joblib import dump, load
from joblib import Parallel, delayed

In [6]:
# Cap displayed text columns at 200 characters so long reviews stay readable.
pd.options.display.max_colwidth = 200

In [22]:
# Requires the third-party `pandarallel` package (pip install pandarallel).
import multiprocessing

# Only the `pandarallel` class is needed; a separate `import pandarallel`
# would be shadowed by this name immediately.
from pandarallel import pandarallel

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# Use every core but one, and never ask for fewer than one worker
# (cpu_count() - 1 would be 0 on a single-core machine).
# use_memory_fs=False makes pandarallel move data through pipes rather than
# a memory file system.
pandarallel.initialize(nb_workers=max(1, num_processors - 1), use_memory_fs=False)

Available CPUs: 16
INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [97]:
# Import the Google Cloud Storage client library
from google.cloud import storage
# Instantiate a client (no credentials are passed; presumably the environment's default credentials are used — TODO confirm)
storage_client = storage.Client()

# Name of the existing project bucket this notebook works with
bucket_name = "nlp_final_project_kshitijm"

# Build a handle to that bucket.
# NOTE(review): per the google-cloud-storage docs, Client.bucket() only
# constructs a local Bucket object; it does not create the bucket or contact
# the service. The "connected" message below therefore does not prove the
# bucket exists or is reachable — confirm if that matters.
bucket = storage_client.bucket(bucket_name)
print(f"Bucket {bucket.name} connected.")

Bucket nlp_final_project_kshitijm connected.


---
## Part 1: Loading Yelp Data

In [9]:
# Public location of the labelled Yelp reviews used to train the classifier.
directory = 'https://storage.googleapis.com/msca-bdp-data-open/yelp/'
fileName = 'yelp_train_sentiment.json'

# `directory` already ends with a slash, so the parts are joined as-is.
path = f"{directory}{fileName}"
path

'https://storage.googleapis.com/msca-bdp-data-open/yelp/yelp_train_sentiment.json'

In [14]:
%%time

# The remote file is read as JSON Lines: one record per line.
yelp = pd.read_json(path, lines=True, orient='records')
yelp.shape

CPU times: user 1.58 s, sys: 355 ms, total: 1.93 s
Wall time: 2.17 s


(255717, 3)

In [15]:
# Preview the first five reviews (head() returns 5 rows by default).
yelp.head()

Unnamed: 0,text,label,lang
0,"I love Deagan's. I do. I really do. The atmosphere is cozy and festive. The shrimp tacos and house fries are my standbys. The fries are sometimes good and sometimes great, and the spicy dipping sa...",1,en
1,I love the classes at this gym. Zumba and. Radio Hip Hop are my favorite. This is such a great fun and I love that it is so reasonably priced!,1,en
2,The tables and floor were dirty. I was the only customer on a Saturday nite and the person working the counter ignored me I had a corned beef sandwich. I took three bites and threw it in the trash,0,en
3,"I had an oil change at the 15515 N Scottsdale Road location. When the car was delivered to me, there were two engine warning lights on that had not been on when I drove the car in. The technicia...",0,en
4,The absolute WORST apartment complex I have ever lived in. Moved here from out of state. Hoped to find a decently priced apartment until I got myself settled in. Wow this place has been trash. Lan...,0,en


In [16]:
# Features are the raw review text; the target is the 0/1 sentiment label.
X, y = yelp['text'], yelp['label']
# Both shapes are printed on their own line.
print(X.shape, y.shape, sep='\n')

(255717,)
(255717,)


In [17]:
# Hold out 25% of the reviews for testing (scikit-learn's default, spelled
# out here) with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(191787,)
(63930,)
(191787,)
(63930,)


---
## Part 2: Trying different sklearn pipelines

#### Naive Bayes

In [30]:
# Bag-of-n-grams counts (unigrams through trigrams) feeding a multinomial
# Naive Bayes classifier.
# NOTE(review): case is preserved (lowercase=False) while the built-in English
# stop-word list is applied. If that list is all lower-case, capitalised stop
# words such as "The" would not be filtered — confirm this is intended.
pipe_nb = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    MultinomialNB()
)

# Fit on the training split; %time reports how long the fit takes.
%time pipe_nb.fit(X_train, y_train)


CPU times: user 2min 24s, sys: 4.54 s, total: 2min 29s
Wall time: 2min 28s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('multinomialnb', MultinomialNB())])

In [31]:
# Predict sentiment labels for the held-out Yelp test reviews (timed).
%time y_pred_nb = pipe_nb.predict(X_test)

CPU times: user 15 s, sys: 91.6 ms, total: 15 s
Wall time: 15 s


In [33]:
# Overall accuracy on the held-out split, shown as a percentage with one decimal.
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred_nb) * 100:.1f}%")

# Per-class precision, recall, F1 and support.
print(classification_report(y_test, y_pred_nb))

Test Accuracy: 94.6%
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     32016
           1       0.97      0.92      0.94     31914

    accuracy                           0.95     63930
   macro avg       0.95      0.95      0.95     63930
weighted avg       0.95      0.95      0.95     63930



In [39]:
%%time

# Persist the fitted pipeline so Part 3 can reload it without retraining.
# `os` and `dump` come from the notebook's import cell, so they are not
# re-imported here. Note that `directory` and `path` are rebound: they
# previously pointed at the Yelp data URL.
directory = '/home/jupyter/Impact_Analysis_GenAI/05_SentAnalysis_models'
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, which avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(directory, exist_ok=True)

path = os.path.join(directory, 'pipe_nb.joblib')
# dump() returns the list of files written, which the cell displays.
dump(pipe_nb, path)

CPU times: user 1min 41s, sys: 4.15 s, total: 1min 45s
Wall time: 1min 45s


['/home/jupyter/Impact_Analysis_GenAI/05_SentAnalysis_models/pipe_nb.joblib']

---
## Part 3: Extrapolating to our dataset

---
#### Loading this model

In [40]:
# Folder holding the saved model; this is the same location the pipeline was dumped to in Part 2.
path_model = '/home/jupyter/Impact_Analysis_GenAI/05_SentAnalysis_models'

In [42]:
%%time 
# Reload the saved Naive Bayes pipeline from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts produced by this project.
clf_nb = load(os.path.join(path_model, 'pipe_nb.joblib'))

CPU times: user 52 s, sys: 2.48 s, total: 54.4 s
Wall time: 53.6 s


#### Loading the dataset

In [58]:
# Load the 2020 articles (with their topic assignments) from the project bucket.
# The read has to be live: with it commented out, `df_filt_2020` is undefined
# on a fresh kernel and the preview below raises NameError.
df_filt_2020 = pd.read_parquet('gs://nlp_final_project_kshitijm/00_Data/NLP_FP_Data5_2020_Topics.parquet')
df_filt_2020.head()

Unnamed: 0,url,date,language,title,text,cleaned_text,article_source,clean_title,title_tokens,cleaned_text_tokens,year_pub,month,month-year,flag_relevant,num_tokens,rake_phrases_articles,rake_phrases_joined,final_topic
0,https://fusionscienceacademy.com/artificial-intelligence-ai-in-social-media-market-to-witness-slow-growth-owing-to-unfavorable-government-policies-2018-2026/,2020-01-30,en,Artificial Intelligence (AI) in Social Media Market to Witness Slow Growth Owing to Unfavorable Government Policies 2018 – 2026 – Fusion Science Academy,\n\nArtificial Intelligence (AI) in Social Media Market to Witness Slow Growth Owing to Unfavorable Government Policies 2018 – 2026 – Fusion Science Academy\nAbout Us\nOur Writers\nContact Us\nTe...,Artificial Intelligence AI in Social MediaMarket to Witness Slow Growth Owing to Unfavorable Government Policies 20182026Fusion Science AcademyOur WritersUs Terms and ConditionsJoin Our with usF...,2026 – Fusion Science Academy,Artificial Intelligence (AI) in Social Media Market to Witness Slow Growth Owing to Unfavorable Government Policies 2018,"['artificial', 'intelligence', 'ai', 'social', 'media', 'market', 'witness', 'slow', 'growth', 'owing', 'unfavorable', 'government', 'policies']","['artificial', 'intelligence', 'ai', 'social', 'mediamarket', 'witness', 'slow', 'growth', 'owing', 'unfavorable', 'government', 'policies', 'fusion', 'science', 'academyour', 'writersus', 'terms'...",2020,1,Jan 2020,1,22283,"[burkert fluid control systems emerson electric co honeywell internationalkitz corporation rotork plc schlumberger limited tyco international ltd watts water technologies, opportunitiestrends 2024...","burkert fluid control systems emerson electric co honeywell internationalkitz corporation rotork plc schlumberger limited tyco international ltd watts water technologies, opportunitiestrends 2024 ...",6
1,https://health.economictimes.indiatimes.com/news/diagnostics/researchers-develop-ai-system-to-detect-and-grade-prostate-cancer/73187612,2020-01-10,en,"artificial intelligence: Researchers develop AI system to detect and grade prostate cancer, Health News, ET HealthWorld","\n\nartificial intelligence: Researchers develop AI system to detect and grade prostate cancer, Health News, ET HealthWorld\n \nSign in/Sign up\n\n NEWS SITES\n\t \n\nRetail News Auto News ...","artificial intelligence: Researchers develop AI system to detect and grade prostate cancer, Health News, ET HealthWorldSign inSign upWS SITESRetail News Auto News Telecom News Energy News IT News...",,"artificial intelligence: Researchers develop AI system to detect and grade prostate cancer, Health News, ET HealthWorld","['artificial', 'intelligence', 'researchers', 'develop', 'ai', 'detect', 'grade', 'prostate', 'cancer', 'health', 'news', 'et', 'healthworld']","['artificial', 'intelligence', 'researchers', 'develop', 'ai', 'detect', 'grade', 'prostate', 'cancer', 'health', 'news', 'et', 'healthworldsign', 'insign', 'upnews', 'sitesretail', 'news', 'auto'...",2020,1,Jan 2020,1,8087,[economic times ethealthworldhome news hospitalspharmamedical devicesdiagnosticspolicyindustrypeople movementfinancial resultsinterviews blogs feature features trends startups medical specialties ...,economic times ethealthworldhome news hospitalspharmamedical devicesdiagnosticspolicyindustrypeople movementfinancial resultsinterviews blogs feature features trends startups medical specialties o...,1
2,https://heraldpublicist.com/bet-gil-on-ai-final-fantasy-tactics-matches-in-hilarious-twitch-stream/,2020-01-15,en,Bet Gil on AI Final Fantasy Tactics Matches in Hilarious Twitch Stream | Herald Publicist,\n\nBet Gil on AI Final Fantasy Tactics Matches in Hilarious Twitch Stream | Herald Publicist\n\nCONTACT US\nPRIVACY POlICY\n \n\n \n\n \n\n\nNewsTechnologyCricketPOLITICAL NEWSEDUCATION \nMegha...,Bet Gil on AI Final Fantasy Tactics Matches in Hilarious Twitch StreamHerald Publicist NTA WSEDUTIOeghans wedding week war with her father: Court papers reveal message that destroyed relationsh...,Herald Publicist,Bet Gil on AI Final Fantasy Tactics Matches in Hilarious Twitch Stream,"['bet', 'gil', 'ai', 'final', 'fantasy', 'tactics', 'matches', 'hilarious', 'twitch', 'stream']","['bet', 'gil', 'ai', 'final', 'fantasy', 'tactics', 'matches', 'hilarious', 'twitch', 'streamherald', 'publicist', 'contact', 'wedding', 'week', 'war', 'father', 'court', 'papers', 'reveal', 'mess...",2020,1,Jan 2020,1,4458,"[hilarious twitch streamnewstechnologycricketpolitil wsedutionnews technology cricket politil ws edution pete add comment share thisfacebooktwittergoogle pluspinterestlinkedinentertanews bet gil, ...","hilarious twitch streamnewstechnologycricketpolitil wsedutionnews technology cricket politil ws edution pete add comment share thisfacebooktwittergoogle pluspinterestlinkedinentertanews bet gil, s...",1
3,https://honestversion.com/2020/01/24/growth-of-cloud-telecommunication-ai-market-in-global-industry-overview-size-and-share-2019-2024/,2020-01-24,en,"Growth of Cloud Telecommunication AI market in global industry: overview, size and share 2019-2024 – Honest Version","\n\nGrowth of Cloud Telecommunication AI market in global industry: overview, size and share 2019-2024 – Honest Version\n\nSkip to content\n\nMenu\nHOME\nAbout Us\nContact\n \n\n \nHonest Version...","Growth of Cloud AI market in global industry: overview, size and share 20192024Honest Version Skip to content Menu HOHonest Version Growth of Cloud AI market in global industry: overview, size ...",Honest Version,"Growth of Cloud Telecommunication AI market in global industry: overview, size and share 2019-2024","['growth', 'cloud', 'ai', 'market', 'global', 'industry', 'overview', 'size', 'share']","['growth', 'cloud', 'ai', 'market', 'global', 'industry', 'overview', 'size', 'share', 'honest', 'version', 'skip', 'content', 'menu', 'homehonest', 'version', 'growth', 'cloud', 'ai', 'market', '...",2020,1,Jan 2020,1,6048,"[cloud telecommunication ai market strategic assessmentpost navigation previous postpreviouspcb design software market outlooks 2020, forecasts 2025next postnext global security awareness computer...","cloud telecommunication ai market strategic assessmentpost navigation previous postpreviouspcb design software market outlooks 2020, forecasts 2025next postnext global security awareness computerb...",6
4,https://marketresearchsheets.com/2020/01/31/global-artificial-intelligence-as-a-service-market-2019-competitive-analysis-by-introduction-ibm-google-amazon-web-services-microsoft/,2020-01-31,en,"Global Artificial Intelligence as a Service Market 2019 Competitive Analysis by- Introduction, Ibm, Google, Amazon Web Services, Microsoft – Market Research Sheets","\n\nGlobal Artificial Intelligence as a Service Market 2019 Competitive Analysis by- Introduction, Ibm, Google, Amazon Web Services, Microsoft – Market Research Sheets\n\nAbout\nContact\nTeam\nFee...","Global Artificial Intelligence as a Service Market 2019 Competitive Analysis by Introduction, Ibm, Google, Amazon Web Services, MicrosoftMarket Research Sheets AboutTeam Feedback Terms of UseMark...",Market Research Sheets,"Global Artificial Intelligence as a Service Market 2019 Competitive Analysis by- Introduction, Ibm, Google, Amazon Web Services, Microsoft","['global', 'artificial', 'intelligence', 'service', 'market', 'competitive', 'analysis', 'introduction', 'ibm', 'google', 'amazon', 'web', 'services', 'microsoft']","['global', 'artificial', 'intelligence', 'service', 'market', 'competitive', 'analysis', 'introduction', 'ibm', 'google', 'amazon', 'web', 'services', 'microsoftmarket', 'research', 'sheets', 'abo...",2020,1,Jan 2020,1,10098,"[service market trendsyou may also like news global automobile brake caliper market 2019 competitive analysis, 2026comment share thisfacebooktwittergoogle pluspinterestlinkedinrelated news global ...","service market trendsyou may also like news global automobile brake caliper market 2019 competitive analysis, 2026comment share thisfacebooktwittergoogle pluspinterestlinkedinrelated news global a...",6


In [None]:
# Collect the cleaned article bodies into a plain Python list for the classifier.
content = df_filt_2020['cleaned_text'].tolist()
# Show the first two documents as a sanity check.
content[:2]

In [64]:
# Score every article with the reloaded pipeline (timed).
# NOTE: this rebinds `y_pred_nb`, which earlier held the Yelp test-set predictions.
%time y_pred_nb = clf_nb.predict(content)

CPU times: user 1min 13s, sys: 275 ms, total: 1min 13s
Wall time: 1min 13s


In [69]:
# Inspect the predicted labels returned for the articles in `content`.
y_pred_nb

array([0, 0, 0, ..., 0, 0, 0])

In [94]:
# Pair each article with its predicted label. This definition must be live:
# with it commented out, `sent_df` is undefined on a fresh kernel and the
# cells that read it raise NameError.
sent_df = pd.DataFrame({"Articles": content, "Sentiment": y_pred_nb})

# Show full article text (no column-width truncation) from here on.
pd.options.display.max_colwidth = None

# Display one article predicted as positive. The seed makes the displayed
# example reproducible; change or drop it to browse other examples.
sent_df[sent_df.Sentiment == 1].sample(1, random_state=123)

Unnamed: 0,Articles,Sentiment
12352,"Artificial Intelligence for Drug Discovery Market to See Major Growth By 2026 Scoop Skip to content Tuesday, May 12, 2020Us Scoop Industry Analytics and Market News Global News Industry Reports Market Report Analysis Forecast Business Opportunity Consumption Status Emerging Trends Future Demands Growth Prospects Market Study Opportunities ForecastYou are hereHomeGlobal NewsArtificial Intelligence for Drug Discovery Market to See Major Growth By 2026Artificial Intelligence for Drug Discovery Market to See Major Growth By 2026May 11, 2020 Navanath R, , The Artificial Intelligence for Drug Discovery Market is well prepared, focusing on the competitive landscape, geographic growth, segmentation and market dynamics, including drivers, constraints and opportunities. It highlights key production, sales and consumption trends so players can improve their sales and growth in the Artificial Intelligence for Drug Discovery Market. It offers a detailed analysis of the competition and the leading companies in the Artificial Intelligence for Drug Discovery Market. Here it focuses on the latest developments, sales, market values, production, gross margin and other important factors in the business of top players operating in the Artificial Intelligence for Drug Discovery Market. Global Artificial Intelligence for Drug Discovery Market was valued at D 175.91 Million in 2018 and is projected to reach D 2,589.81 Million by 2026, growing at a GR of 39.9from 2019 to 2026. GetDownload Sample Copy @The various contributors to the value chain in the Artificial Intelligence for Drug Discovery Market include manufacturers: CorporationDeep GenomicsIBM With a comprehensive quantitative and qualitative analysis, the report offers an encyclopedic and accurate research study on important aspects of the Artificial Intelligence for Drug Discovery Market. It shows key factors that influence the growth of various segments and regions in the Artificial Intelligence for Drug Discovery Market. 
It also offers SWOT, Porters Five Forces and PESTLE analyzes to thoroughly examine the Artificial Intelligence for Drug Discovery Market. It contains a detailed study of manufacturing costs, upstream and downstream buyers, dealers, marketing strategies and development trends for marketing channels in the Artificial Intelligence for Drug Discovery Market. It also provides strategic advice and recommendations for players to ensure success in the Artificial Intelligence for Drug Discovery Market. Ask for Discount @Table of Contents: Overview: The report begins with an overview of the Artificial Intelligence for Drug Discovery Market, in which the authors discuss the scope of the products, type and application segments as well as the regional markets. This section also contains highlights of the market size analysis. Competition by manufacturers: Here, the analysts give the production share, the share of sales and the average price of the manufacturers for the reporting period 20142019. Readers are also provided with details on products, areas served and production facilities by manufacturers. This section contains another chapter that highlights various competitive situations and trends. Share of production by region: This section shows the gross margin, price, production and revenue of all regional markets examined in the report. Key players: Each player profiled in the report is rated for market growth based on served markets, core business, price, sales, gross margin, production, manufacturing locations, served areas and other factors. Production cost analysis: It includes the analysis of the main raw materials, the analysis of the production cost structure, the analysis of the manufacturing process and the analysis of the industrial chain. Market forecast: It includes the price and trend forecast, the sales and growth rate forecast and the forecast of the production growth rate of the global and regional markets for the forecast period 20192026. 
Finally, the Artificial Intelligence for Drug Discovery Market offers a general conclusion of research and the feasibility of investing in new projects is assessed. The Artificial Intelligence for Drug Discovery Market is a valuable guide for individuals and companies interested in selling the market. Receive a custom report in your inbox within 24 hours @: Verified market research partners with clients to provide insight into strategic and growth analytics data that help achieve business goals and targets. Our core values include trust, integrity, and authenticity for our clients. Analysts with high expertise in data gathering and governance utilize industry techniques to collate and examine data at all stages. Our analysts are trained to combine modern data collection techniques, superior research methodology, subject expertise and years of collective experience to produce informative and accurate research reports.Us: Mr. Edwyne FernandesCall: 1 650 781 4080Email: emailprotected TAGS: Artificial Intelligence for Drug Discovery Market Size, Artificial Intelligence for Drug Discovery Market Growth, Artificial Intelligence for Drug Discovery Market Forecast, Artificial Intelligence for Drug Discovery Market Analysis, Artificial Intelligence for Drug Discovery Market Trends, Artificial Intelligence for Drug Discovery Market Post navigation Artificial Intelligence Chip Market to See Major Growth By 2026Intelligent Transportation Systems Market to See Major Growth By 2026Related posts3D Printing Market to See Major Growth By 2026May 12, 2020 Navanath R, , The 3D Printing Market is well prepared, focusing on the competitive landscape,...Air Cooling Apparatus Market to See Major Growth By 2026May 12, 2020 Navanath R, , The Air Cooling Apparatus Market is well prepared, focusing on the competitive...Imaging Chemicals Market to See Major Growth By 2026May 12, 2020 Navanath R, , The Imaging Chemicals Market is well prepared, focusing on the competitive landscape,...Leave a 
Comment Cancel replyRecent Posts3D Printing Market to See Major Growth By 2026 Air Cooling Apparatus Market to See Major Growth By 2026 Imaging Chemicals Market to See Major Growth By 2026 Fat Filled Milk Powder Market Intelligence and Analysis for Period 20182028 UV Disinfection Market to See Major Growth By 2026All Right ReservedProudly powered by WordPressTheme: SuperNews by Acme Themes",1


In [93]:
# Number of articles assigned to each predicted class.
sent_df.Sentiment.value_counts()

0    31488
1     1179
Name: Sentiment, dtype: int64

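On inspection, articles predicted as positive (1) appear to be labelled correctly, but articles predicted as negative (0) do not: the model assigns class 0 to 31,488 of the 32,667 articles (about 96%), which suggests the Yelp-trained classifier does not transfer well to news text.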