In [None]:
import json
from io import StringIO, BytesIO

import boto3
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
def lambda_handler(event, context):
    """Lambda-style entry point that dispatches on ``event['event_type']``.

    Args:
        event: Mapping whose ``event_type`` is ``'train'`` or ``'predict'``.
            The whole mapping is forwarded to the selected handler.
        context: Forwarded unchanged to the selected handler.

    Returns:
        dict: ``{'statusCode': 200, 'body': <JSON string>}`` when the selected
        handler succeeds. Otherwise a dict that always carries
        ``'error': 'Invalid'`` together with a ``statusCode`` of 400
        (missing/unsupported ``event_type``) or 500 (the handler raised).
    """
    try:
        # A non-dict event (e.g. None) is treated like a missing event_type.
        event_type = event.get('event_type') if isinstance(event, dict) else None

        if event_type == 'train':
            result = read_rankings_and_train(event, context)
        elif event_type == 'predict':
            result = predict(event, context)
        else:
            # Without this branch `result` was never bound and the failure was
            # only masked by the blanket exception handler below.
            print('Unsupported event_type: {!r}'.format(event_type))
            return {'statusCode': 400, 'error': 'Invalid'}

        return {
            'statusCode': 200,
            'body': json.dumps(result)
        }
    except Exception as exc:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate; log the cause instead of hiding it.
        print('lambda_handler failed: {!r}'.format(exc))
        return {'statusCode': 500, 'error': 'Invalid'}

if __name__ == '__main__':
    import pprint
    import sys

    # lambda_handler dispatches on 'event_type' and predict() reads the text
    # from 'sentence', so the event must carry both keys to reach the model.
    # NOTE(review): predict() uses the module-level vectorizer/classifier, so
    # they must be defined and fitted before this runs — confirm the intended
    # execution order.
    if len(sys.argv) < 2:
        print('usage: pass the sentence to classify as the first argument')
    else:
        response = lambda_handler(
            {'event_type': 'predict', 'sentence': sys.argv[1]}, None)
        pprint.pprint(response)

In [96]:
def read_rankings_and_train(event=None, context=None):
    """Load the labelled review files from S3 and fit the model per source.

    For every source (yelp, amazon, imdb) the tab-separated file is read into
    a frame with ``sentence`` and ``label`` columns, split 75/25 with a fixed
    seed, and the module-level ``vectorizer`` and ``classifier`` are fitted on
    the training part and scored on the held-out part.

    Note that ``vectorizer.fit`` / ``classifier.fit`` are called again on each
    iteration, so after this function returns the shared objects hold the fit
    from the last source processed.

    Args:
        event: Unused; accepted so the function can be called by a dispatcher.
            Defaults to ``None`` so it can also be called with no arguments.
        context: Unused; same as ``event``.

    Returns:
        dict: ``{'training_avg_score': <mean accuracy over the sources>}`` on
        success, or an empty dict if anything failed (the error is printed).
    """
    result = {}
    try:
        # Rely on boto3's default credential chain (environment, shared config,
        # IAM role) rather than passing empty hard-coded key strings.
        s3 = boto3.client('s3')

        filepath_dict = {'yelp': 'sentiment-analysis/yelp_labelled.txt',
                         'amazon': 'sentiment-analysis/amazon_cells_labelled.txt',
                         'imdb': 'sentiment-analysis/imdb_labelled.txt'}

        df_list = []
        for source, filepath in filepath_dict.items():
            obj = s3.get_object(Bucket='ml-data.s3.us-east-1.amazonaws.com', Key=filepath)
            df = pd.read_csv(BytesIO(obj['Body'].read()), names=['sentence', 'label'], sep='\t')
            df['source'] = source  # Add another column filled with the source name
            df_list.append(df)

        # Concatenate once after the loop instead of rebuilding on every pass.
        df = pd.concat(df_list)

        sources = df['source'].unique()
        total_score = 0

        for source in sources:
            df_source = df[df['source'] == source]
            sentences = df_source['sentence'].values
            y = df_source['label'].values

            sentences_train, sentences_test, y_train, y_test = train_test_split(
                sentences, y, test_size=0.25, random_state=1000)

            vectorizer.fit(sentences_train)
            X_train = vectorizer.transform(sentences_train)
            X_test = vectorizer.transform(sentences_test)

            classifier.fit(X_train, y_train)
            score = classifier.score(X_test, y_test)
            print('Accuracy for {} data: {:.4f}'.format(source, score))
            total_score += score

        # Average over the number of sources actually scored rather than a
        # hard-coded 3, so the value stays correct if the file list changes.
        if len(sources):
            total_score /= len(sources)

        result = {
            'training_avg_score': total_score
        }
    except Exception as e:
        # Best-effort: report the failure and fall through to the empty result.
        print('Training failed: {!r}'.format(e))

    return result

In [97]:
def predict(event, context=None):
    """Classify one sentence with the module-level vectorizer and classifier.

    Args:
        event: Either a dict with the text under ``'sentence'``, the sentence
            itself as a string, or a list/tuple holding exactly one sentence.
        context: Unused; optional so the function can be called directly.

    Returns:
        dict: ``{'review': 'Positive review'}`` when the predicted label is 1,
        otherwise ``{'review': 'Negative Review'}``.

    Raises:
        ValueError: If no sentence string can be extracted from ``event``.
    """
    if isinstance(event, dict):
        sentence = event.get('sentence')
    elif isinstance(event, (list, tuple)):
        # Only a single review is reported, so refuse ambiguous input rather
        # than silently dropping extra sentences.
        if len(event) != 1:
            raise ValueError('predict() expects exactly one sentence')
        sentence = event[0]
    else:
        sentence = event

    # Fail early with a clear message instead of deep inside the vectorizer.
    if not isinstance(sentence, str):
        raise ValueError("predict() needs the text as a string under 'sentence'")

    review_transformed = vectorizer.transform([sentence])
    review_result = classifier.predict(review_transformed)
    print(review_result[0])
    review = 'Positive review' if review_result[0] == 1 else 'Negative Review'
    print(review)
    result = {
        "review": review
    }
    return result

In [98]:
# Module-level model objects used by read_rankings_and_train() and predict().
vectorizer = CountVectorizer()
classifier = LogisticRegression()

# The training function is declared as (event, context); pass both explicitly
# so this cell also runs on a fresh kernel.
result = read_rankings_and_train({'event_type': 'train'}, None)

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
1
Positive review
{'review': 'Positive review'}


In [99]:
# predict() reads the text from event['sentence'] and takes a context
# argument, so pass a mapping plus an explicit None rather than a bare list.
result = predict({'sentence': 'I hate this'}, None)
print(result)

0
Negative Review
{'review': 'Negative Review'}


In [None]:
# Ad-hoc check that a single dataset object can be fetched and parsed.
# NOTE(review): this key has no 'sentiment-analysis/' prefix and read_csv is
# called without names/sep, unlike read_rankings_and_train — confirm both are
# intended.
try:
    # `s3` only exists as a local inside read_rankings_and_train, so create a
    # client here; keep the fetch inside the try so S3 errors are reported too.
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket='ml-data.s3.us-east-1.amazonaws.com', Key='amazon_cells_labelled.txt')
    df = pd.read_csv(BytesIO(obj['Body'].read()))
except Exception as e:
    print(e)