In [11]:
import sys
import pickle
import random


def file_to_wordset(filename):
    ''' Converts a file with a word per line to a Python set '''
    words = []
    with open(filename, 'r') as f:
        for line in f:
            words.append(line.strip())
    return set(words)


def write_status(i, total):
    ''' Writes status of a process to console '''
    sys.stdout.write('\r')
    sys.stdout.write('Processing %d/%d' % (i, total))
    sys.stdout.flush()


def save_results_to_csv(results, csv_file):
    ''' Save list of type [(tweet_id, positive)] to csv in Kaggle format '''
    with open(csv_file, 'w') as csv:
        csv.write('id,prediction\n')
        for tweet_id, pred in results:
            csv.write(tweet_id)
            csv.write(',')
            csv.write(str(pred))
            csv.write('\n')


def top_n_words(pkl_file_name, N, shift=0):
    """
    Returns a dictionary of form {word:rank} of top N words from a pickle
    file which has a nltk FreqDist object generated by stats.py

    Args:
        pkl_file_name (str): Name of pickle file
        N (int): The number of words to get
        shift: amount to shift the rank from 0.
    Returns:
        dict: Of form {word:rank}
    """
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    most_common = freq_dist.most_common(N)
    words = {p[0]: i + shift for i, p in enumerate(most_common)}
    return words


def top_n_bigrams(pkl_file_name, N, shift=0):
    """
    Returns a dictionary of form {bigram:rank} of top N bigrams from a pickle
    file which has a Counter object generated by stats.py

    Args:
        pkl_file_name (str): Name of pickle file
        N (int): The number of bigrams to get
        shift: amount to shift the rank from 0.
    Returns:
        dict: Of form {bigram:rank}
    """
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    most_common = freq_dist.most_common(N)
    bigrams = {p[0]: i for i, p in enumerate(most_common)}
    return bigrams


def split_data(tweets, validation_split=0.1):
    """Split the data into training and validation sets

    Args:
        tweets (list): list of tuples
        validation_split (float, optional): validation split %

    Returns:
        (list, list): training-set, validation-set
    """
    index = int((1 - validation_split) * len(tweets))
    random.shuffle(tweets)
    return tweets[:index], tweets[index:]


In [51]:
import utils

# Classifies a tweet based on the number of positive and negative words in it

TRAIN_PROCESSED_FILE = '/Users/saurabhkulkarni/Github-NLP/train-processed.csv'
TEST_PROCESSED_FILE = '/Users/saurabhkulkarni/Downloads/test-processed.csv'
POSITIVE_WORDS_FILE = '/Users/saurabhkulkarni/Github-NLP/positive-words.txt'
NEGATIVE_WORDS_FILE = '/Users/saurabhkulkarni/Github-NLP/negative-words.txt'
TRAIN = False


def classify(processed_csv, test_file=True, **params):
    file = open('/Users/saurabhkulkarni/Github-NLP/positive-words.txt', 'r')
    positive_words = [str(line).split("\n")[0] for line in file.readlines()]
    
    file1 = open('/Users/saurabhkulkarni/Github-NLP/negative-words.txt', 'r')
    negative_words = [str(line).split("\n")[0] for line in file1.readlines()]

    predictions = []
    print(positive_words[0:4])
    print(negative_words[0:4])
    with open(processed_csv, 'rb') as csv:
        for line in csv:
            if test_file:
                tweet_id, tweet = line.strip().split(',')
            else:
                tweet_id, label, tweet = line.strip().split(',')

            pos_count, neg_count = 0, 0
            for word in tweet.split():
                if word in positive_words:
                    print("positive word found :",word)
                    pos_count += 1
                elif word in negative_words:
                    print("negative word found :",word)
                    neg_count += 1
            print(pos_count, neg_count)
            prediction = 1 if pos_count >= neg_count else 0
            if test_file:
                predictions.append((tweet_id, prediction))
            else:
                predictions.append((tweet_id, int(label), prediction))
    return predictions


In [52]:
if TRAIN:
    predictions = classify(TRAIN_PROCESSED_FILE, test_file=(not TRAIN), positive_words=POSITIVE_WORDS_FILE, negative_words=NEGATIVE_WORDS_FILE)
    correct = sum([1 for p in predictions if p[1] == p[2]]) * 100.0 / len(predictions)
    print('Correct = %.2f%%' % correct)
else:
    predictions = classify(TEST_PROCESSED_FILE, test_file=(not TRAIN), positive_words=POSITIVE_WORDS_FILE, negative_words=NEGATIVE_WORDS_FILE)
    save_results_to_csv(predictions, 'baseline.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xef in position 3988: invalid continuation byte

In [10]:
pwd

'/Users/saurabhkulkarni/SaurabhWorkspace/ADS-Project'

NameError: name 'negative_words' is not defined