In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import openai

In [15]:
import nltk
import ssl

# NOTE(review): this swaps ssl's default HTTPS context factory for the
# unverified one, and the swap stays in place for the rest of the kernel
# session -- not just for the download below. Any later code that relies on
# ssl._create_default_https_context will skip certificate verification too.
# Confirm this is acceptable, or fix the local certificate store instead.
if hasattr(ssl, "_create_unverified_context"):
    # Interpreters without the private helper are simply left untouched.
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the lexicon used by the sentiment analyzer further down.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zhing/nltk_data...


True

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import re

Helper functions to produce features

In [41]:
def count_positive_negative(words, sia=None):
    """Count how many words carry positive and how many carry negative sentiment.

    Each word is scored on its own with ``sia.polarity_scores(word)``; a word
    is counted as positive when its ``'pos'`` score is above zero and as
    negative when its ``'neg'`` score is above zero.

    Args:
        words: Iterable of word strings.
        sia: Optional analyzer exposing ``polarity_scores(str) -> dict``.
            When omitted, a module-level ``sia`` is used if one exists,
            otherwise a new ``SentimentIntensityAnalyzer`` is created.

    Returns:
        Tuple ``(num_pos_words, num_neg_words)``.
    """
    if sia is None:
        # Keep working with notebooks that define a global `sia`.
        sia = globals().get("sia")
    if sia is None:
        # Imported here so this cell runs on a fresh kernel regardless of
        # which other cells have been executed.
        from nltk.sentiment import SentimentIntensityAnalyzer
        sia = SentimentIntensityAnalyzer()

    # Score every word exactly once and reuse the result for both counts.
    word_scores = [sia.polarity_scores(word) for word in words]

    num_pos_words = sum(1 for score in word_scores if score['pos'] > 0)
    num_neg_words = sum(1 for score in word_scores if score['neg'] > 0)

    # Access value with result[0], result[1]
    return num_pos_words, num_neg_words

In [44]:
def count_positive_followed_by_negative(words, sia=None):
    """Count negative words that closely follow a positive word.

    A word is counted when its ``'compound'`` score is below zero and at least
    one of the (up to) five words immediately before it has a ``'compound'``
    score above zero.

    Args:
        words: Sequence of word strings.
        sia: Optional analyzer exposing ``polarity_scores(str) -> dict``.
            When omitted, a module-level ``sia`` is used if one exists,
            otherwise a new ``SentimentIntensityAnalyzer`` is created.

    Returns:
        The number of such negative words.
    """
    if sia is None:
        # Keep working with notebooks that define a global `sia`.
        sia = globals().get("sia")
    if sia is None:
        # Imported here so this cell runs on a fresh kernel regardless of
        # which other cells have been executed.
        from nltk.sentiment import SentimentIntensityAnalyzer
        sia = SentimentIntensityAnalyzer()

    # Score each word once up front; the look-back window below would
    # otherwise re-score the same word for every later position.
    compounds = [sia.polarity_scores(word)['compound'] for word in words]

    num_positive_followed_by_negative = 0
    for i, compound in enumerate(compounds):
        if compound >= 0:
            continue
        # Look back over at most the five preceding words.
        window = compounds[max(0, i - 5):i]
        if any(previous > 0 for previous in window):
            num_positive_followed_by_negative += 1

    return num_positive_followed_by_negative

In [48]:
def count_proper_nouns(pos_tags):
    """Return how many ``(word, tag)`` pairs in ``pos_tags`` are tagged ``'NNP'``."""
    total = 0
    for _token, pos in pos_tags:
        if pos == 'NNP':
            total += 1
    return total

In [49]:
def count_conjunctions(pos_tags):
    """Return how many ``(word, tag)`` pairs in ``pos_tags`` are tagged ``'CC'``."""
    matching = [pos for _token, pos in pos_tags if pos == 'CC']
    return len(matching)

In [52]:
def count_superlatives(pos_tags):
    """Return how many ``(word, tag)`` pairs in ``pos_tags`` are tagged ``"JJS"``."""
    tags_only = [pos for _token, pos in pos_tags]
    return tags_only.count("JJS")

In [45]:
def count_pronouns(text):
    """Count first-, second- and third-person pronouns in ``text``.

    Matching is case-insensitive and restricted to whole words.

    Usage:
        first_person, second_person, third_person = count_pronouns(text)

    Returns:
        Tuple ``(first_person, second_person, third_person)`` of match counts.
    """
    person_patterns = (
        r'\b(I|me|my|mine|we|us|our|ours)\b',
        r'\b(you|your|yours)\b',
        r'\b(he|him|his|she|her|hers|it|its|they|them|their|theirs)\b',
    )
    first_person, second_person, third_person = (
        sum(1 for _match in re.finditer(pattern, text, flags=re.IGNORECASE))
        for pattern in person_patterns
    )
    return first_person, second_person, third_person

In [55]:
def count_hedges(text, hedging_words=None):
    """Count the tokens of ``text`` that appear in the hedging-word lexicon.

    Args:
        text: Raw text; it is tokenized with ``nltk.word_tokenize``.
        hedging_words: Optional iterable of hedging words. Pass a preloaded
            lexicon to avoid re-reading the file for every document. When
            omitted, one word per line is read from ``hedging_words.txt``.

    Returns:
        Number of tokens whose lowercase form is in the lexicon.
    """
    if hedging_words is None:
        # Load the list of hedging words from a text file
        with open('hedging_words.txt', 'r') as f:
            hedging_words = [line.strip() for line in f]

    # Tokens are lower-cased before lookup, so lower-case the entries as well;
    # blank entries are dropped, and a set keeps each lookup constant-time.
    lexicon = {entry.strip().lower() for entry in hedging_words if entry.strip()}

    # Use NLTK to tokenize the text into words
    tokens = nltk.word_tokenize(text)

    # Count the number of hedging words in the text
    return sum(1 for token in tokens if token.lower() in lexicon)

In [57]:
def count_boosts(text, boosting_words=None):
    """Count the tokens of ``text`` that appear in the boosting-word lexicon.

    Args:
        text: Raw text; it is tokenized with ``nltk.word_tokenize``.
        boosting_words: Optional iterable of boosting words. Pass a preloaded
            lexicon to avoid re-reading the file for every document. When
            omitted, one word per line is read from ``boosting_words.txt``.

    Returns:
        Number of tokens whose lowercase form is in the lexicon.
    """
    if boosting_words is None:
        # Load the list of boosting words from a text file
        with open('boosting_words.txt', 'r') as f:
            boosting_words = [line.strip() for line in f]

    # Tokens are lower-cased before lookup, so lower-case the entries as well;
    # blank entries are dropped, and a set keeps each lookup constant-time.
    lexicon = {entry.strip().lower() for entry in boosting_words if entry.strip()}

    # Use NLTK to tokenize the text into words
    tokens = nltk.word_tokenize(text)

    # Count the number of boosting words in the text
    return sum(1 for token in tokens if token.lower() in lexicon)

In [58]:
def count_numbers(text):
    """Count the numeric literals that appear in ``text``.

    A number is either a comma-grouped integer part (``1,234,567``) or a plain
    run of digits (``12345``), optionally followed by a decimal fraction
    (``.56``). A plain digit run of any length is one number, so ``12345`` is
    counted once rather than being split into ``123`` and ``45``.

    Args:
        text: The string to scan.

    Returns:
        The number of non-overlapping matches.
    """
    number_pattern = (
        r"(?:\d{1,3}(?:,\d{3})+|\d+)"  # thousands-grouped digits, or any plain digit run
        r"(?:\.\d+)?"                  # optional decimal fraction
    )
    return sum(1 for _match in re.finditer(number_pattern, text))

In [None]:
def count_quotes(text):
    """Count quoted passages in ``text``.

    Python's ``re`` module has no ``\\p{P}`` Unicode-property escape, so a
    pattern using it fails to compile. The quote delimiters are spelled out
    explicitly instead. A passage counts when it sits on one line and is
    wrapped in:

    * straight double quotes ``"..."``,
    * curly double quotes (U+201C ... U+201D), or
    * single quotes ``'...'`` that do not touch a word character on the
      outside, so apostrophes in words like ``isn't`` are not treated as
      quote marks.

    Args:
        text: The string to scan.

    Returns:
        The number of non-overlapping quoted passages found.
    """
    quoted_span = re.compile(
        r'"[^"\n]+"'                   # "straight double quotes"
        r'|\u201c[^\u201d\n]+\u201d'   # curly double quotes
        r"|(?<!\w)'[^'\n]+'(?!\w)"     # 'single quotes', excluding apostrophes
    )
    return sum(1 for _match in quoted_span.finditer(text))

In [None]:
def get_features(text):
    """Compute the hand-crafted feature tuple for one document.

    Sentiment features are computed on whitespace-split words; part-of-speech
    features on ``nltk.word_tokenize`` tokens tagged with ``nltk.pos_tag``;
    the remaining counts work on the raw text.

    Args:
        text: The document as a string.

    Returns:
        A 13-tuple, in this order: ``num_pos_words``, ``num_neg_words``,
        ``num_positive_followed_by_negative``, ``first_person``,
        ``second_person``, ``third_person``, ``num_proper_nouns``,
        ``num_conjunctions``, ``num_superlatives``, ``num_hedges``,
        ``num_boosts``, ``num_numbers``, ``num_quotes``.
    """
    # The sentiment helpers look `sia` up at module level, so an analyzer bound
    # to a local variable here would be invisible to them. Publish one shared
    # instance and reuse it on later calls instead of rebuilding it per document.
    global sia
    if globals().get("sia") is None:
        sia = SentimentIntensityAnalyzer()

    # Sentiment counts operate on plain whitespace-separated words.
    whitespace_words = text.split()
    num_pos_words, num_neg_words = count_positive_negative(whitespace_words)
    num_positive_followed_by_negative = count_positive_followed_by_negative(whitespace_words)

    # Part-of-speech counts operate on NLTK tokens.
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    num_proper_nouns = count_proper_nouns(pos_tags)
    num_conjunctions = count_conjunctions(pos_tags)
    num_superlatives = count_superlatives(pos_tags)

    # The remaining counters take the raw text.
    first_person, second_person, third_person = count_pronouns(text)
    num_hedges = count_hedges(text)
    num_boosts = count_boosts(text)
    num_numbers = count_numbers(text)
    num_quotes = count_quotes(text)

    return (
        num_pos_words,
        num_neg_words,
        num_positive_followed_by_negative,
        first_person,
        second_person,
        third_person,
        num_proper_nouns,
        num_conjunctions,
        num_superlatives,
        num_hedges,
        num_boosts,
        num_numbers,
        num_quotes,
    )


Process Data

In [4]:
# Location of the raw training set, relative to the notebook's working directory.
training_data_filename = "../raw_data/fulltrain.csv"

# Column names are supplied here, so the first line of the file is read as
# data rather than as a header; header=None only spells that out.
df_train = pd.read_csv(
    training_data_filename,
    header=None,
    names=["label", "document"],
)

In [None]:
X_train = []
y_train = []

for index, row in df_train.iterrows():
    y_train.append(row["label"])
    text = row["document"]