## NLP APIs

##### This machine learning project explores the NLP (Natural Language Processing) APIs from AWS and GCP on a dataset of YouTube comments.

### Ingest

In [185]:
import pandas as pd

In [186]:
# Load the raw YouTube comments. Rows with the wrong number of fields are
# skipped with a warning instead of aborting the read. `on_bad_lines="warn"`
# is the replacement for `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv("GBcomments.csv", on_bad_lines="warn")
df.head()

b'Skipping line 113225: expected 4 fields, saw 5\n'
b'Skipping line 158379: expected 4 fields, saw 7\nSkipping line 241590: expected 4 fields, saw 5\nSkipping line 245637: expected 4 fields, saw 7\n'
b'Skipping line 521402: expected 4 fields, saw 5\n'


Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0


In [187]:
# Keep only the first 1,000 comments: every row costs one request per NLP
# provider further down. The explicit copy makes the sample an independent
# frame rather than a slice of the full data, so columns added to it later are
# written to its own storage.
df = df.iloc[0:1000, :].copy()

### EDA

In [189]:
# Summary statistics (count, mean, std, quartiles) for the sampled frame.
df.describe()

Unnamed: 0,likes,replies
count,1000.0,1000.0
mean,0.434,0.087
std,3.246873,0.376261
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,67.0,4.0


### Modeling-AWS NLP

In [190]:
import pandas as pd
import boto3
import json

In [191]:
# Notebook-level AWS Comprehend client, pinned to the us-east-1 region.
comprehend = boto3.client("comprehend", region_name="us-east-1")

#### Test AWS NLP

In [192]:
# Smoke test: send one English sentence to Comprehend and pretty-print the
# complete JSON response (metadata included).
text = "It is raining today in Seattle"
print('Calling DetectSentiment')
response = comprehend.detect_sentiment(Text=text, LanguageCode='en')
print(json.dumps(response, sort_keys=True, indent=4))
print('End of DetectSentiment\n')

Calling DetectSentiment
{
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "164",
            "content-type": "application/x-amz-json-1.1",
            "date": "Mon, 18 Mar 2019 05:16:43 GMT",
            "x-amzn-requestid": "0526eb55-493d-11e9-abd6-a53edf85ee6c"
        },
        "HTTPStatusCode": 200,
        "RequestId": "0526eb55-493d-11e9-abd6-a53edf85ee6c",
        "RetryAttempts": 0
    },
    "Sentiment": "NEUTRAL",
    "SentimentScore": {
        "Mixed": 0.0029508057050406933,
        "Negative": 0.003515031188726425,
        "Neutral": 0.8899842500686646,
        "Positive": 0.10354989022016525
    }
}
End of DetectSentiment



### AWS NLP

#### Apply AWS NLP

In [193]:
def _comprehend_client():
    """Return one shared Comprehend client, creating it on first use."""
    cached = getattr(_comprehend_client, "_client", None)
    if cached is None:
        cached = boto3.client(service_name='comprehend', region_name="us-east-1")
        _comprehend_client._client = cached
    return cached


def create_sentiment(row):
    """Classify one piece of text with AWS Comprehend.

    Args:
        row: The text to analyse (one value of the comment column).

    Returns:
        The ``Sentiment`` label from the Comprehend response, or ``None`` when
        ``row`` is not a non-empty string or the request fails.
    """
    # Missing values (None/NaN) and empty strings cannot be analysed, so skip
    # the network round-trip for them.
    if not isinstance(row, str) or not row:
        return None

    try:
        # Reuse a single client instead of building a new one for every row.
        payload = _comprehend_client().detect_sentiment(Text=row, LanguageCode='en')
        sentiment = payload['Sentiment']
    except Exception as exc:
        # Best effort: one failing comment must not abort the whole column.
        # Report the real cause rather than assuming a size-limit violation.
        print("Sentiment request failed: {!r}".format(exc))
        return None
    return sentiment

def apply_sentiment(df, column="comment_text"):
    """Add a 'Sentiment' column by running create_sentiment over `column`.

    The frame passed in is modified in place and is also the return value.
    """
    labels = df[column].apply(create_sentiment)
    df['Sentiment'] = labels
    return df

In [194]:
import time

# Time the full AWS pass (one Comprehend request per comment).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

# apply_sentiment adds the column in place, so df2 is the same object as df.
df2 = apply_sentiment(df)

end = time.perf_counter()
print(end - start)

400.44319796562195


In [195]:
# Preview the first rows to confirm the Sentiment column was populated.
df2.head()

Unnamed: 0,video_id,comment_text,likes,replies,Sentiment
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0,NEUTRAL
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0,NEUTRAL
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0,POSITIVE
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0,NEUTRAL
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0,NEUTRAL


### Modeling-GCP NLP

In [196]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import os

In [197]:
# Tell the Google client library where the credentials JSON lives.
# setdefault() keeps a value that is already exported in the environment, so
# the machine-specific path below is only a fallback.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/Users/xing/desktop/CREDENTIALS.json")

#### Test GCP NLP

In [198]:
# Smoke test: extract the named entities from one sentence.
text = "LeBron James plays for the Cleveland Cavaliers."
document = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT)
# `client` stays in the notebook namespace; create_sentiment_score reuses it.
client = language.LanguageServiceClient()
entities = client.analyze_entities(document).entities

In [199]:
# Display the entities returned by analyze_entities for the test sentence.
entities

[name: "LeBron James"
type: PERSON
metadata {
  key: "mid"
  value: "/m/01jz6d"
}
metadata {
  key: "wikipedia_url"
  value: "https://en.wikipedia.org/wiki/LeBron_James"
}
salience: 0.8982541561126709
mentions {
  text {
    content: "LeBron James"
    begin_offset: -1
  }
  type: PROPER
}
, name: "Cleveland Cavaliers"
type: ORGANIZATION
metadata {
  key: "mid"
  value: "/m/0jm7n"
}
metadata {
  key: "wikipedia_url"
  value: "https://en.wikipedia.org/wiki/Cleveland_Cavaliers"
}
salience: 0.1017458513379097
mentions {
  text {
    content: "Cleveland Cavaliers"
    begin_offset: -1
  }
  type: PROPER
}
]

#### Apply GCP NLP

In [200]:
def create_sentiment_score(row):
    """Score one piece of text with the GCP Natural Language API.

    Relies on the notebook-level ``client`` (a LanguageServiceClient) and on
    the ``types`` / ``enums`` modules imported from google.cloud.language.

    Args:
        row: The text to analyse (one value of the comment column).

    Returns:
        ``document_sentiment.score`` from the API response, or ``None`` when
        ``row`` is not a string or the request fails.
    """
    # Missing values (None/NaN) are not text, so skip the API call for them.
    if not isinstance(row, str):
        return None

    try:
        document = types.Document(content=row, type=enums.Document.Type.PLAIN_TEXT)
        sentiment = client.analyze_sentiment(document=document).document_sentiment
        sentiment_score = sentiment.score
    except Exception as exc:
        # Best effort: one failing comment must not abort the whole column.
        # Report the real cause rather than assuming a size-limit violation.
        print("Sentiment request failed: {!r}".format(exc))
        return None
    return sentiment_score

def apply_sentiment_score(df, column="comment_text"):
    """Add a 'Sentiment_score' column by running create_sentiment_score over `column`.

    The frame passed in is modified in place and is also the return value.
    """
    scores = df[column].apply(create_sentiment_score)
    df['Sentiment_score'] = scores
    return df

In [201]:
import time

# Time the full GCP pass (one analyze_sentiment request per comment).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

# apply_sentiment_score adds the column in place, so df3 is the same object as df2.
df3 = apply_sentiment_score(df2)

end = time.perf_counter()
print(end - start)

Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded:  Fail
Size exceeded

In [202]:
# Preview the first rows to confirm the Sentiment_score column was populated.
df3.head()

Unnamed: 0,video_id,comment_text,likes,replies,Sentiment,Sentiment_score
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0,NEUTRAL,0.7
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0,NEUTRAL,0.1
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0,POSITIVE,0.0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0,NEUTRAL,-0.5
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0,NEUTRAL,0.6


### Compare two NLP APIs

In [204]:
def score_to_numeric(x):
    """Translate an AWS sentiment label into a fixed numeric score.

    'POSITIVE' -> 0.8, 'NEUTRAL' -> 0.1, 'NEGATIVE' -> -0.6. Any other value
    (for example 'MIXED' or None) yields None.
    """
    label_scores = (
        ('POSITIVE', 0.8),
        ('NEUTRAL', 0.1),
        ('NEGATIVE', -0.6),
    )
    for label, score in label_scores:
        if x == label:
            return score
    return None

In [205]:
# Convert the AWS labels to numbers so they can sit next to the GCP scores.
# Labels that score_to_numeric does not map get None from it.
df3['Sentiment_AWS'] = df3['Sentiment'].apply(score_to_numeric)
df3.head()

Unnamed: 0,video_id,comment_text,likes,replies,Sentiment,Sentiment_score,Sentiment_AWS
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0,NEUTRAL,0.7,0.1
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0,NEUTRAL,0.1,0.1
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0,POSITIVE,0.0,0.8
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0,NEUTRAL,-0.5,0.1
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0,NEUTRAL,0.6,0.1


In [212]:
# Presentation copy of the results: drop rows with any missing value, then
# give every result column a provider-specific name.
df_show = (
    df3
    .dropna()
    .rename(
        index=str,
        columns={
            "Sentiment": "Sentiment_AWS",
            "Sentiment_score": "Sentiment_score_GCP",
            "Sentiment_AWS": "Sentiment_score_AWS",
        },
    )
)
df_show.head(10)

Unnamed: 0,video_id,comment_text,likes,replies,Sentiment_AWS,Sentiment_score_GCP,Sentiment_score_AWS
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0,NEUTRAL,0.7,0.1
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0,NEUTRAL,0.1,0.1
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0,POSITIVE,0.0,0.8
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0,NEUTRAL,-0.5,0.1
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0,NEUTRAL,0.6,0.1
5,jt2OHQh0HoQ,Keep calm and buy iphone 8 Keep calm and buy i...,0,0,POSITIVE,0.4,0.8
6,jt2OHQh0HoQ,i am a big fan of youtube and u !!!!!!!!!!!!!,0,0,NEUTRAL,0.6,0.1
7,jt2OHQh0HoQ,You will never find Losers who line up and pay...,0,0,NEGATIVE,-0.8,-0.6
8,jt2OHQh0HoQ,*APPLE JUST COMMENTED ON MY LAST VIDEO* I'm cr...,0,0,NEUTRAL,-0.4,0.1
9,jt2OHQh0HoQ,"I'm only here to see Emma, I love her so much!...",0,0,POSITIVE,0.8,0.8


In [206]:
# Keep only video_id and the two numeric score columns for aggregation.
df4 = df3.drop(columns=['comment_text', 'likes', 'replies', 'Sentiment'])

In [207]:
# Preview the reduced frame that feeds the per-video aggregation.
df4.head()

Unnamed: 0,video_id,Sentiment_score,Sentiment_AWS
0,jt2OHQh0HoQ,0.7,0.1
1,jt2OHQh0HoQ,0.1,0.1
2,jt2OHQh0HoQ,0.0,0.8
3,jt2OHQh0HoQ,-0.5,0.1
4,jt2OHQh0HoQ,0.6,0.1


In [208]:
# Average both providers' scores per video; video_id becomes the index.
df_compare = df4.groupby('video_id').mean()

In [209]:
# Per-video gap between the providers: GCP mean score minus mapped AWS mean score.
# NOTE(review): the two means are computed column by column, so if a comment is
# missing only one provider's score the averages may cover different rows;
# confirm whether rows should be dropped before grouping.
df_compare["difference"]=df_compare["Sentiment_score"]-df_compare["Sentiment_AWS"]

In [210]:
# Rename the GCP column so both score columns carry their provider's name.
# index=str also converts the index labels (the video ids) to strings.
df_compare=df_compare.rename(index=str, columns={"Sentiment_score": "Sentiment_GCP"})

In [211]:
# Show only videos that have a value in every column. The result is displayed,
# not assigned, so df_compare itself still contains the rows with missing values.
df_compare.dropna()

Unnamed: 0_level_0,Sentiment_GCP,Sentiment_AWS,difference
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LcZ2AuvxXNA,0.23913,0.206061,0.03307
NsjsmgmbCfc,0.353,0.387,-0.034
T_PuZBdT2iM,0.051,0.056701,-0.005701
YPVcg45W0z4,0.106,0.213131,-0.107131
jt2OHQh0HoQ,0.09375,0.135,-0.04125
w8fAellnPns,0.075,0.128283,-0.053283
zZ2CLmvqfXg,0.079798,0.121212,-0.041414
