A small exercise in using n-grams (bigrams and trigrams) to predict positive or negative sentiment.

In [12]:
import csv
import json
from nltk import ngrams
from nltk.classify import NaiveBayesClassifier
import nltk

In [14]:
def extract_ngrams(text, sizes=(2, 3)):
    """Return a presence-feature dict of word n-grams found in ``text``.

    The text is split on whitespace and every run of ``n`` consecutive
    tokens, for each ``n`` in ``sizes``, becomes a key mapped to ``True``.
    This is the ``{feature: True}`` shape passed to the classifier in this
    file.

    The n-grams are built locally from the token list rather than through a
    module-level ``ngrams`` name, so the result cannot be affected if that
    name is rebound elsewhere in the session.

    Args:
        text: The string to extract features from.
        sizes: Iterable of positive n-gram lengths. Defaults to bigrams and
            trigrams.

    Returns:
        A dict mapping each n-gram (a tuple of tokens) to ``True``. Texts
        with fewer tokens than the smallest size yield an empty dict.
    """
    # Tokenize once; every n-gram size slides over the same token list.
    tokens = text.split()

    features = {}
    for n in sizes:
        # Zipping n staggered views of the tokens yields each window of n
        # consecutive tokens. zip stops at the shortest view, so a text with
        # fewer than n tokens contributes nothing for that size.
        for gram in zip(*(tokens[offset:] for offset in range(n))):
            features[gram] = True
    return features

# Read the CSV file and create a mapping from gmap_id to category.
# newline='' lets the csv module handle line endings itself, so quoted fields
# that contain embedded newlines are parsed correctly.
category_mapping = {}
with open('yolosac3.csv', mode='r', encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        category_mapping[row['gmap_id']] = row['category']

# Read the JSON file. The encoding is given explicitly (matching the CSV read
# above) so decoding does not depend on the platform's default encoding.
with open('data/yolosac-reviews.json', 'r', encoding='utf-8') as file:
    reviews_data = json.load(file)

# Combine the data into (text, label) pairs.
combined_reviews = []
for review in reviews_data:
    gmap_id = review['gmap_id']
    category = category_mapping.get(gmap_id, 'Unknown')  # Default to 'Unknown' if no category found

    # Only reviews with an integer rating can be labelled; a missing or null
    # rating fails the isinstance check and is skipped.
    rating = review.get('rating')
    if not isinstance(rating, int):
        continue

    # Treat missing or null review text as empty rather than crashing on the
    # concatenation below; such a review still contributes its category words.
    text = review.get('text') or ''

    # Ratings of 4 and above count as positive, everything else as negative.
    sentiment = 'pos' if rating >= 4 else 'neg'
    combined_reviews.append((text + " " + category, sentiment))

# Split data into training and testing sets (first 80% / last 20%).
# NOTE(review): the split is positional with no shuffling. If the JSON file is
# ordered (e.g. by business or by date), the test set may not be representative;
# consider shuffling with a fixed seed first.
split_index = int(0.8 * len(combined_reviews))
training_data = combined_reviews[:split_index]
testing_data = combined_reviews[split_index:]

# Extract ngrams and create feature sets
features = [(extract_ngrams(text), label) for (text, label) in training_data]
test_features = [(extract_ngrams(text), label) for (text, label) in testing_data]

# Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(features)

# Evaluate the classifier on the held-out reviews
accuracy = nltk.classify.accuracy(classifier, test_features)
print("Accuracy:", accuracy)

# Predict the sentiment of a new review.
# The feature dict is deliberately not named `ngrams`: rebinding that name
# would shadow the `ngrams` function imported from nltk and break any later
# use of it in this session.
review = "The service was not good at all"
review_features = extract_ngrams(review)
sentiment = classifier.classify(review_features)
print("Sentiment:", sentiment)