## Create customer-review features from the NRC Emotion Lexicon in four languages

In this part, we conduct a sentiment analysis of the reviews using the NRC Emotion Lexicon in four languages (English, French, Italian and Spanish). The NRC CSV file contains 14,182 words in 105 languages, and each word is coded for 2 sentiments and 8 emotions.

Unlike the NRC Emotion Lexicon we used in class, this CSV has a quite different format, so we need to build the emotion dictionary ourselves by looking up the emotions associated with each word in the four languages.

In [None]:
import pandas as pd
import numpy as np
import datetime

In [None]:
# Load the multilingual NRC emotion lexicon into a DataFrame.
# NOTE(review): the file is decoded as ISO-8859-1 -- confirm this is the file's
# real encoding; if it is actually UTF-8, accented words will be mis-decoded.
nrc = pd.read_csv('NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.csv',encoding = "ISO-8859-1")

In [None]:
# Preview the first rows of the lexicon.
nrc.head()

In [None]:
# Print a summary of the lexicon frame (columns, dtypes, non-null counts).
nrc.info()

In [None]:
# Load the raw reviews.
# NOTE(review): decoded as ISO-8859-1, the same as the lexicon -- confirm the
# file's real encoding; the two files must be decoded consistently for
# accented words to match.
reviews = pd.read_csv('reviews.csv',encoding = "ISO-8859-1")

In [None]:
# Print a summary of the reviews frame (columns, dtypes, non-null counts).
reviews.info()

In [None]:
# Show the column names available in the reviews data.
reviews.columns

In [None]:
# Discard reviews that have no comment text.
reviews = reviews.dropna(subset=['comments'])

In [None]:
# Show the lexicon's column names (used to pick the language and emotion columns).
nrc.columns

In [None]:
# Keep only the four language columns followed by the ten sentiment/emotion
# columns, in this order.
nrc = nrc.loc[:, [
    'English (en)',
    'French (fr)',
    'Italian (it)',
    'Spanish (es)',
    'Positive',
    'Negative',
    'Anger',
    'Anticipation',
    'Disgust',
    'Fear',
    'Joy',
    'Sadness',
    'Surprise',
    'Trust',
]]

In [None]:
# Build emotion_dict: {word: set of sentiment/emotion labels flagged with 1}.
# The words come from the first four columns of `nrc` (the language columns);
# a word that appears in several rows or languages gets the union of labels.
emotion_labels = ['Positive','Negative','Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']
emotion_dict = dict()

# Iterate plain tuples once per row instead of re-materialising the row with
# nrc.iloc[x] for every cell (slow, and integer keys on a string-labelled
# Series are deprecated positional access).
word_rows = nrc.iloc[:, :4].itertuples(index=False, name=None)
flag_rows = nrc[emotion_labels].itertuples(index=False, name=None)
for words, flags in zip(word_rows, flag_rows):
    row_labels = {label for label, flag in zip(emotion_labels, flags) if flag == 1}
    for word in words:
        # Missing translations are read as NaN; they are not words, so skip
        # them rather than storing NaN keys in the dictionary.
        if pd.isna(word):
            continue
        emotion_dict.setdefault(word, set()).update(row_labels)

In [None]:
# Keep only the words that carry at least one sentiment/emotion label.
emotion_dict_final = {
    word: labels
    for word, labels in emotion_dict.items()
    if labels
}

In [None]:
# Display the filtered word -> labels dictionary.
emotion_dict_final

In [None]:
# Show the reviews column names again before filtering and indexing.
reviews.columns

In [None]:
# Number of distinct listings that have at least one review
# (dropna=False counts a missing id as a value, as unique() does).
reviews['listing_id'].nunique(dropna=False)

## Remove automatically generated system reviews

In [None]:
# Drop the reviews containing the cancellation notice text.
# regex=False matches the phrase literally, and .copy() makes reviews1 an
# independent frame so the later column assignments do not raise
# SettingWithCopyWarning against `reviews`.
reviews1 = reviews[
    ~reviews.comments.str.contains('The host canceled this reservation', regex=False)
].copy()

In [None]:
# Index the reviews by listing so all comments of a listing share one label.
reviews1 = reviews1.set_index('listing_id')

In [None]:
# Parse the 'YYYY-MM-DD' date strings in one vectorized call.
# Unlike per-row strptime, this turns missing dates into NaT instead of
# raising, and is safe to re-run once the column already holds datetimes.
# assign() returns a new frame, so no value is written into a possible slice.
reviews1 = reviews1.assign(date=pd.to_datetime(reviews1['date'], format='%Y-%m-%d'))

## Select reviews after 2018-07-01

In [None]:
# Keep only the reviews dated strictly after 2018-07-01.
reviews1 = reviews1.loc[reviews1['date'] > datetime.datetime(2018, 7, 1)]

In [None]:
# Display the filtered, listing-indexed reviews.
reviews1

In [None]:
# Build one (listing_id, combined_text) tuple per listing by joining all of
# that listing's comments with single spaces.
# Grouping on the index does this in one pass, replacing the repeated
# reviews1.loc[i] lookups and the type(...) == str special case for listings
# with a single review. Listings come out sorted by id rather than in set
# iteration order, so the result is reproducible.
review_list = [
    (listing_id, ' '.join(comments))
    for listing_id, comments in reviews1.groupby(level=0)['comments']
]

In [None]:
# Display the (listing_id, combined review text) tuples.
review_list

In [None]:
def emotion_analyzer(text,emotion_dict=emotion_dict_final):
    """Return the share of words in *text* associated with each emotion.

    The text is split on whitespace and every token is looked up in
    *emotion_dict* exactly as written (the lookup is case-sensitive and any
    attached punctuation is part of the token).

    Args:
        text: The text to analyze.
        emotion_dict: Mapping of word -> collection of emotion labels.

    Returns:
        A dict with one key per label found anywhere in *emotion_dict*; each
        value is the number of matching tokens divided by the total number
        of tokens. Empty text yields 0 for every label.
    """
    # Start every known label at zero so callers can index any of them.
    emotion_count = {
        emotion: 0
        for labels in emotion_dict.values()
        for emotion in labels
    }

    words = text.split()
    for word in words:
        for emotion in emotion_dict.get(word, ()):
            emotion_count[emotion] += 1

    # Normalize once at the end: adding 1/total_words per hit would
    # accumulate floating-point rounding error.
    total_words = len(words)
    if total_words:
        for emotion in emotion_count:
            emotion_count[emotion] /= total_words
    return emotion_count

In [None]:
def comparative_emotion_analyzer(text_tuples,object_name="listing_id"):
    """Score several texts with emotion_analyzer and tabulate the results.

    Args:
        text_tuples: Iterable of (identifier, text) pairs.
        object_name: Name given to the index of the returned frame.

    Returns:
        A DataFrame indexed by identifier with one column per emotion
        ('Fear', 'Trust', 'Negative', 'Positive', 'Joy', 'Disgust',
        'Anticipation', 'Anger', 'Sadness', 'Surprise'). If an identifier
        occurs more than once, the last text wins.

    Raises:
        KeyError: If emotion_analyzer's result lacks one of the columns.
    """
    import pandas as pd

    emotion_columns = ['Fear','Trust','Negative',
                       'Positive','Joy','Disgust','Anticipation','Anger',
                       'Sadness','Surprise']

    # Collect the rows first and build the frame once: enlarging a DataFrame
    # row by row with .loc copies it on every insert. A dict keeps the
    # "later duplicate overwrites earlier" behaviour of the .loc assignment.
    rows = {}
    for identifier, text in ((pair[0], pair[1]) for pair in text_tuples):
        result = emotion_analyzer(text)
        rows[identifier] = [result[column] for column in emotion_columns]

    return pd.DataFrame(
        list(rows.values()),
        index=pd.Index(list(rows.keys()), name=object_name),
        columns=emotion_columns,
    )

# Score every listing's combined review text; one row per listing.
df = comparative_emotion_analyzer(review_list)

In [None]:
# Combine the emotion shares into one score using fixed weights:
# negative emotions count against the listing, positive ones in its favour.
# The terms are summed in the order listed below.
experience_weights = {
    'Fear': -10,
    'Trust': 10,
    'Negative': -5,
    'Positive': 5,
    'Joy': 10,
    'Disgust': -10,
    'Anticipation': 5,
    'Anger': -10,
    'Sadness': -5,
    'Surprise': 10,
}
df['customer_experience'] = sum(
    df[emotion] * weight for emotion, weight in experience_weights.items()
)

In [None]:
# Write the per-listing emotion scores (including the index) to a CSV file.
df.to_csv('list_review_rating.csv')

In [None]:
# Display the final per-listing score table.
df