In [17]:
# -*- coding: utf-8 -*-
import json
import collections
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import math
import csv
from collections import Counter

def check_what_lines_we_are_processing(df):
    """Print the most frequent values of the ``line`` column.

    Args:
        df: DataFrame that has a ``line`` column.
    """
    series = df['line'].value_counts()
    # One pre-formatted argument keeps the output identical under both the
    # Python 2 print statement and the Python 3 print function.
    print("we are processing %s \n" % (series.head(),))

def clean_with_regex(regex_columns,patterns,df):
    """Apply regex substitutions to the given columns of ``df``.

    Args:
        regex_columns: names of the columns to clean.
        patterns: iterable of ``(regex, replacement)`` pairs, applied in
            order to every listed column.
        df: DataFrame to modify; its columns are reassigned in place.

    Returns:
        The same ``df`` object with the cleaned columns.
    """
    for column in regex_columns:
        for pattern in patterns:
            # Assign the result back instead of calling replace(inplace=True)
            # on ``df[column]``: the in-place form may act on a temporary
            # copy and leave ``df`` unchanged.
            df[column] = df[column].replace(pattern[0], pattern[1], regex=True)
    return df
    
def get_keywords_from_json():
    """
    counts the lower-cased subject/object keywords of the reviews rated
    1, 2, 5 or 6 and writes the (up to 10000) most common ones, with their
    frequency, to data/keywords.csv
    """
    interesting_ratings = ["1", "2", "5", "6"]
    pieces = ["subject", "object"]
    with open('data/'+'Alchemy_response_relations_and_keywords'+'.json', 'rb') as fp:
        returned_data = json.load(fp)

    words = []
    for element in returned_data:
        review = returned_data[element]
        if review["rating"] not in interesting_ratings:
            continue
        for sentence in review["relations"]:
            for piece in pieces:
                # Only subject/object parts that carry keywords contribute.
                if piece not in sentence or "keywords" not in sentence[piece]:
                    continue
                words.extend(text["text"].lower()
                             for text in sentence[piece]["keywords"])

    counts = Counter(words).most_common(10000)
    frequency_table = pd.DataFrame(counts, columns=['word', 'frequency'])
    frequency_table.to_csv('data/keywords.csv', encoding='utf-8')



def convert_to_float(to_float, df):
    """Cast the listed columns to float, replacing missing values with 0.0.

    Args:
        to_float: names of the columns to convert.
        df: DataFrame whose columns are reassigned in place.

    Returns:
        The same ``df`` object.
    """
    for name in to_float:
        as_float = df[name].astype(float)
        df[name] = as_float.fillna(0.0)
    return df


def save_csv(df,hypothesis):
    """Write ``df`` to ``data/alchemy_ratings_<hypothesis>.csv`` (UTF-8).

    The target file name is printed before writing.

    Args:
        df: DataFrame to save.
        hypothesis: string inserted into the output file name.
    """
    fileName = 'data/alchemy_ratings_' + hypothesis + '.csv'
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(fileName)
    df.to_csv(fileName, encoding='utf-8')

def show_file(returned_data, number):
    """Pretty-print the value stored under the ``number``-th key.

    Keys are counted from 0 in the dictionary's iteration order. Nothing is
    printed when ``number`` is outside the range of available positions.

    Args:
        returned_data: dictionary of JSON-serialisable values.
        number: zero-based position of the entry to print.
    """
    for position, element in enumerate(returned_data):
        if position == number:
            print(json.dumps(returned_data[element], indent=4, sort_keys=True))
            # Only one position can match, so stop scanning.
            break

def flatten_dictionary(returned_dictionary,review,call,df,commentDb,line):
    """Flatten one review's ``call`` payload and append it to ``df``.

    For ``call == "keywords"`` the review is normalised together with its
    rating/sail_date/ship/line metadata and merged (outer join on ``text``)
    with the normalised keyword records. For any other ``call`` each element
    of ``returned_dictionary[review][call]`` becomes one flattened row,
    tagged with the review id and the rating found in ``commentDb``; reviews
    missing from ``commentDb[line]`` are skipped.

    Args:
        returned_dictionary: mapping of review id to response dictionary.
        review: id of the review to flatten.
        call: key of the payload to flatten ("keywords" or e.g. "relations").
        df: DataFrame accumulated so far.
        commentDb: mapping of line -> review id -> review metadata.
        line: cruise line the review belongs to.

    Returns:
        A DataFrame with the new rows appended, or ``df`` itself when there
        is nothing to add.
    """
    if call == "keywords":
        first_level = json_normalize(returned_dictionary[review], call,
                                     ['rating', 'sail_date', "ship", "line"])
        second_level = json_normalize(returned_dictionary[review][call])
        together = pd.merge(first_level, second_level, on='text', how='outer')
        return pd.concat([df, together])

    if review not in commentDb[line]:
        return df

    rating = commentDb[line][review]["rating"]
    frames = []
    for element in returned_dictionary[review][call]:
        second_level = json_normalize(element)
        second_level['review'] = review
        second_level['rating'] = rating
        frames.append(second_level)
    if not frames:
        return df
    # Concatenate once instead of re-copying the accumulated frame for every
    # element of the payload.
    return pd.concat([df] + frames)
        
        
def open_json_review_files(cruiseLines):
    """
    creates dictionary with one or more cruise lines review json files

    Each file data/<line>.json is loaded, every review in it is tagged with
    a "line" key holding the cruise line name, and per-line and total review
    counts are printed.

    Args:
        cruiseLines: iterable of cruise line names (file stems under data/).

    Returns:
        Dictionary mapping each cruise line to its dictionary of reviews.
    """
    commentDb = {}
    totcount = 0
    for line in cruiseLines:
        with open('data/' + line + '.json', 'rb') as fp:
            reviews = json.load(fp)
        for element in reviews:
            reviews[element]["line"] = line
        commentDb[line] = reviews
        totcount += len(reviews)
        # Single-argument print: same output on Python 2 and Python 3.
        print("processed %s %s reviews" % (line, len(reviews)))
    print("total processed %s reviews" % totcount)
    return commentDb

def cleanRatings(commentDb,cruiseLines):
    """
    cleans up and aggregates ratings data

    For every review of the listed cruise lines: a "kind" or "ship" shorter
    than 5 characters is replaced by a placeholder, a non-numeric "rating"
    becomes "no rating", and "aggregatedRating" is set to good / medium /
    bad / no rating according to the rating. The reviews are modified in
    place and the same commentDb is returned.
    """
    ratingAggregations = {
        "6": "good", "5": "good",
        "4": "medium", "3": "medium",
        "2": "bad", "1": "bad",
        "no rating": "no rating",
    }
    for line in cruiseLines:
        reviews = commentDb[line]
        for element in reviews:
            review = reviews[element]
            if len(review["kind"]) < 5:
                review["kind"] = "not specified"
            if not review["rating"].isdigit():
                review["rating"] = "no rating"
            if len(review["ship"]) < 5:
                review["ship"] = "not available"
            # Numeric ratings outside 1-6 are not in the table (KeyError).
            review["aggregatedRating"] = ratingAggregations[review["rating"]]
    return commentDb



def open_food_keyword_file():
    """
    loads data/keywords_with_food.csv and returns a dictionary mapping each
    value of its "word" column to the matching "total" value
    """
    keyword_table = pd.read_csv('data/keywords_with_food.csv', encoding='utf-8')
    totals_by_word = keyword_table.set_index('word')["total"]
    return totals_by_word.to_dict()
 
def get_keywords(sentence):
    """
    collects the lower-cased keyword texts of a relation's subject and object

    Side effect: for each part that has at least one keyword, the part's
    "keywords" list is replaced by a single string, the last lower-cased
    keyword of that part. An empty keyword list is left untouched.

    Returns the list of all keywords found and the (modified) sentence.
    """
    words = []
    for piece in ("subject", "object"):
        if piece not in sentence or "keywords" not in sentence[piece]:
            continue
        lowered = [entry["text"].lower() for entry in sentence[piece]["keywords"]]
        if lowered:
            # Flatten the list of keyword records to one plain string.
            sentence[piece]["keywords"] = lowered[-1]
        words.extend(lowered)
    return words, sentence

def is_food(words,food_keywords,food):
    """
    returns True when any of the words is a known food keyword; otherwise
    returns the incoming food value unchanged
    """
    if any(word in food_keywords for word in words):
        return True
    return food
                               
    
def make_food_json():
    """
    loads the Alchemy relations/keywords response and keeps only the food
    related content

    Reviews rated 1, 2, 5 or 6 keep just the relations whose subject/object
    keywords contain a known food keyword; reviews with another rating, or
    with no food relation left, are removed. Returns the filtered dictionary.
    """
    food_keywords = open_food_keyword_file()
    interesting_ratings = ["1", "2", "5", "6"]
    with open('data/'+'Alchemy_response_relations_and_keywords'+'.json', 'rb') as fp:
        returned_data = json.load(fp)

    to_drop = []
    for element in returned_data:
        review = returned_data[element]
        if review["rating"] not in interesting_ratings:
            to_drop.append(element)
            continue
        food_sentence = []
        for relation in review["relations"]:
            # get_keywords also rewrites the relation's keyword lists.
            words, relation = get_keywords(relation)
            if is_food(words, food_keywords, False):
                food_sentence.append(relation)
        review["relations"] = food_sentence
        if not food_sentence:
            to_drop.append(element)

    for key in to_drop:
        returned_data.pop(key, None)
    return returned_data

def make_relations_and_keywords_csv_alchemy(returned_relations,commentDb):
    """
    generates the csv file that powers the relations and keywords dashboard

    The relations are flattened and cleaned by clean_dictionary_keys, the
    text columns are stripped of characters other than letters, digits, %,
    apostrophes and spaces (runs of spaces collapsed), a constant "Count"
    column of 1 is added and the result is saved through save_csv under the
    name 'food_db'.

    Args:
        returned_relations: mapping of review id to relations response.
        commentDb: mapping of line -> review id -> review metadata.

    Returns:
        The DataFrame that was written to disk.
    """
    call = "relations"
    # Single-argument print: same output on Python 2 and Python 3.
    print("in total we have got %s data for %s reviews"
          % (call, len(returned_relations)))
    df = clean_dictionary_keys(returned_relations, call, commentDb)
    patterns = [(r'[^A-Za-z0-9%\' ]+',''),(r' +',' ')]
    text_columns = ["Message", "ObjKeyword", "Verb", "SbjKeyword"]
    df = clean_with_regex(text_columns, patterns, df)
    df["Count"] = 1
    save_csv(df, 'food_db')
    return df

def clean_dictionary_keys(returned_dictionary,call,commentDb):
    """Flatten the per-review responses into one cleaned DataFrame.

    Steps, in order:
      1. For every review, report non-English ones and strip the bookkeeping
         keys from English ones; reviews carrying a "line" key are flattened
         with flatten_dictionary.
      2. Rename the dotted column names to short names.
      3. Lower-case and fill the text columns, fill missing sentiment scores
         with 0, and build the "Message" column from subject, negation,
         action, object and location texts.
      4. Drop weak-sentiment rows (delete_low_sentiment), drop the helper
         columns, binarize the rating and remove rows whose message is 24
         characters or shorter.

    Args:
        returned_dictionary: mapping of review id to response dictionary;
            English entries are modified in place (keys removed).
        call: payload key passed on to flatten_dictionary.
        commentDb: mapping of line -> review id -> review metadata.

    Returns:
        The cleaned DataFrame.
    """
    df = pd.DataFrame()
    keys_to_drop = ["language", "status", "usage", "totalTransactions", "url"]
    for review in returned_dictionary:
        entry = returned_dictionary[review]
        if entry["language"] != "english":
            # The language lives on the review entry, not on the top-level
            # dictionary (the old lookup raised KeyError here).
            print("%s review seems not to be in English, but in %s"
                  % (review, entry["language"]))
        else:
            for key in keys_to_drop:
                entry.pop(key, None)
        if "line" in entry:
            line = entry["line"]
            df = flatten_dictionary(returned_dictionary, review, call, df, commentDb, line)
    to_rename = {"subject.text": "sbjText",
                 "action.verb.negated": "actVerbNeg",
                 "action.text": "actText",
                 "object.text": "objText",
                 "location.text": "locText",
                 "subject.sentiment.score": "sbjSentScore",
                 "subject.sentiment.type": "SbjSent",
                 "object.sentiment.score": "objSentScore",
                 "object.sentiment.type": "ObjSent",
                 "action.verb.text": "Verb",
                 "object.keywords": "ObjKeyword",
                 "subject.keywords": "SbjKeyword",
                 "rating": "Rating",
                 }
    df.rename(columns=to_rename, inplace=True)
    df = make_lowercase(["Verb", "locText", "ObjKeyword", "SbjKeyword"], df)
    text_columns = ["Verb", "ObjKeyword", "SbjKeyword", "sbjText", "actVerbNeg",
                    "actText", "objText", "locText"]
    df = fill_na_columns(df, text_columns, "")
    df = fill_na_columns(df, ["objSentScore", "sbjSentScore"], 0)
    # .loc writes into df itself; chained df[col][mask] = ... may write to a
    # copy (SettingWithCopyWarning).
    df.loc[df["actVerbNeg"] == "1", "actVerbNeg"] = "didn't"
    df["Message"] = (df["sbjText"] + " " + df["actVerbNeg"] + " " + df["actText"]
                     + " " + df["objText"] + " " + df["locText"])
    df = delete_low_sentiment(df)
    blank_out_rules = ("Message", 24)
    to_drop = ["action.lemmatized", "action.verb.tense", "review", "actVerbNeg",
               "actText", "locText", "objText", "sbjText", "object.sentimentFromSubject.score",
               "object.sentimentFromSubject.type", "location.sentiment.score", "location.sentiment.type",
               'sbjSentScore', 'objSentScore']
    df = drop_columns(to_drop, df)
    df = binarize_ratings(df)
    df = blank_out_short_sentences(df, blank_out_rules)
    # info() prints its own report and returns None, so it is not wrapped in
    # a print (which only added a stray "None" line).
    df.info()

    return df

def delete_low_sentiment(df):
    """Neutralise weak sentiment scores and drop fully neutral rows.

    Subject scores within [-0.55, 0.55] and object scores within
    [-0.8, 0.8] are set to 0 (missing scores count as 0), the matching
    ``SbjSent`` / ``ObjSent`` label is set to "neutral", and only rows
    where at least one of the two labels is not "neutral" are kept.

    Args:
        df: DataFrame with ``sbjSentScore``, ``objSentScore``, ``SbjSent``
            and ``ObjSent`` columns; it is modified in place.

    Returns:
        The filtered DataFrame.
    """
    limits = (("sbjSentScore", .55), ("objSentScore", .8))
    for column, _ in limits:
        # Same conversion as convert_to_float: cast, then missing -> 0.0.
        df[column] = df[column].astype(float).fillna(0.0)
    for column, limit in limits:
        weak = (df[column] >= -limit) & (df[column] <= limit)
        # .loc assigns on df itself; chained df[col][mask] = ... may hit a
        # copy, and the .ix indexer is gone from current pandas.
        df.loc[weak, column] = 0
    df.loc[df['objSentScore'] == 0, 'ObjSent'] = "neutral"
    df.loc[df['sbjSentScore'] == 0, 'SbjSent'] = "neutral"
    return df[(df['SbjSent'] != 'neutral') | (df['ObjSent'] != 'neutral')]

def binarize_ratings(df):
    """Replace the numeric ``Rating`` column by "good" / "bad" labels.

    Ratings are cast to float (missing values become 0.0); values of 4 or
    more become "good", everything else "bad".

    Args:
        df: DataFrame with a ``Rating`` column; the column is replaced in
            place.

    Returns:
        The same ``df`` object.
    """
    # Same conversion as convert_to_float: cast, then missing -> 0.0.
    ratings = df["Rating"].astype(float).fillna(0.0)
    # Decide on the numeric values once. The previous two-step assignment
    # compared a column already containing "good" strings against 4, which
    # only worked through Python 2's mixed-type ordering.
    is_good = ratings >= 4
    df["Rating"] = is_good.map({True: "good", False: "bad"})
    return df


def make_lowercase(columns,df):
    """Lower-case the string values of the listed columns of ``df``.

    The columns are reassigned on ``df`` itself, which is also returned.
    """
    for name in columns:
        lowered = df[name].str.lower()
        df[name] = lowered
    return df

def drop_columns(to_drop,df):
    """Remove the listed columns from ``df`` in place, one at a time.

    A column name that is not present raises the error of DataFrame.drop;
    columns listed before it have already been removed at that point.
    Returns the same ``df`` object.
    """
    for name in to_drop:
        df.drop(labels=name, axis=1, inplace=True)
    return df
    
def fill_na_columns(df,lista,fill_with):
    """Replace missing values in the listed columns with ``fill_with``.

    Args:
        df: DataFrame whose columns are reassigned in place.
        lista: names of the columns to fill.
        fill_with: value substituted for missing entries.

    Returns:
        The same ``df`` object.
    """
    for column in lista:
        # Assign the filled column back: fillna(inplace=True) on df[column]
        # may act on a temporary copy and leave df unchanged.
        df[column] = df[column].fillna(fill_with)
    return df

def blank_out_short_sentences(df,rules):
    """Blank out short texts in one column and drop the rows left empty.

    Args:
        df: DataFrame to process; the short values are blanked in place.
        rules: pair ``(column, max_length)``. Values of ``column`` whose
            length is ``max_length`` or less are set to "".

    Returns:
        A DataFrame holding only the rows whose ``column`` is not "".
    """
    column, max_length = rules[0], rules[1]
    df.loc[df[column].str.len() <= max_length, column] = ""
    # Filter on the column named by the rule; the filter used to be
    # hard-coded to "Message" regardless of which column was blanked.
    return df[df[column] != ""]

def delete_useless_rows(mask,df):
    """Return the rows of ``df`` selected by the boolean ``mask``."""
    return df[mask]

# Cruise lines to process; main() passes this list to open_json_review_files,
# which loads data/<line>.json for each entry.
cruiseLines=["Msc"]
def main():
    """
    runs the whole pipeline: loads and cleans the review files of
    cruiseLines, builds the food-only relations data and writes the
    relations/keywords csv; returns the resulting DataFrame
    """
    reviews = open_json_review_files(cruiseLines)
    reviews = cleanRatings(reviews, cruiseLines)
    food_relations = make_food_json()
    # To inspect a single entry while debugging: show_file(food_relations, 7)
    return make_relations_and_keywords_csv_alchemy(food_relations, reviews)


# Run the pipeline; main() returns the final DataFrame.
main()



#get_keywords_from_json()

processed Msc 2153 reviews
total processed 2153 reviews
in total we have got

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 relations data for 963 reviews
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1267 entries, 0 to 0
Data columns (total 7 columns):
Verb          1267 non-null object
ObjKeyword    1267 non-null object
ObjSent       1267 non-null object
Rating        1267 non-null object
SbjKeyword    1267 non-null object
SbjSent       1267 non-null object
Message       1267 non-null object
dtypes: object(7)
memory usage: 79.2+ KB
None
data/alchemy_ratings_food_db.csv


Unnamed: 0,Verb,ObjKeyword,ObjSent,Rating,SbjKeyword,SbjSent,Message,Count
0,be,,positive,good,pizza,neutral,the pizza was amazing I can definitely attest ...,1
0,recommend,breakfast,positive,good,,neutral,I recommend you serve your awesome tiramisu ev...,1
0,find,meal,negative,good,,neutral,We found that the shoving and pushing and the ...,1
0,show,situation,positive,good,table,neutral,The waiter at that table his name escapes me s...,1
0,take,beer,neutral,bad,bar service,negative,bar service takes for a beer or a pina colada,1
0,be,grain,positive,good,board bakery,neutral,grilled half tomatoesThe on board bakery was e...,1
0,be,welcome bonus,positive,bad,deck,neutral,The free softserve ice cream served daily outs...,1
0,be,,positive,good,dinner,neutral,Dinner was very enjoyable as well,1
0,have,blueberry cheesecake,positive,good,,neutral,They had excellent tiramisu and blueberry chee...,1
0,give,wartama,positive,good,ta,neutral,ta give of praise to our waiter Wartama who wa...,1
