In [28]:
import pandas as pd
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
# Load the essay table. Later cells read its EssaySet, EssayText, Score1 and
# DataSet columns.
df = pd.read_csv('data/df.csv')

In [3]:
# Sanity check: number of rows loaded into df.
len(df)

28682

setup

In [4]:
import gensim
# Load word vectors stored in the binary word2vec format (binary=True) as a
# KeyedVectors object; it is handed to FeatureExtractor further down.
model = gensim.models.KeyedVectors.load_word2vec_format('word2vec_model/GoogleNews-vectors-negative300.bin', binary=True)  

In [5]:
from gensim.models.doc2vec import Doc2Vec
# Load a previously saved Doc2Vec model from disk; it is handed to
# FeatureExtractor further down.
doc2vec_model = Doc2Vec.load("doc2vec_model/doc2vec_wiki_d300_n5_w8_mc50_t12_e10_dbow.model")

In [23]:
import nltk
from nltk import ngrams, pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter

def generate_ngram_results(set_ids, df, count_threshold = 3, target_score = 2, ngram_sizes = (2, 3, 4)):
    """Collect the frequent POS-tagged n-grams of each essay set.

    For every id in ``set_ids`` the essays of that set whose ``Score1`` equals
    ``target_score`` are tokenized and POS-tagged, and n-grams over the tagged
    tokens are counted across all of those essays.

    Args:
        set_ids: Iterable of EssaySet ids to process.
        df: DataFrame with ``EssaySet``, ``Score1`` and ``EssayText`` columns.
        count_threshold: Minimum number of occurrences (inclusive) an n-gram
            needs within a set to be kept.
        target_score: ``Score1`` value selecting the essays that are mined.
            Defaults to 2, the value that used to be hard-coded.
        ngram_sizes: n-gram lengths to generate. Defaults to bi-, tri- and
            tetra-grams.

    Returns:
        dict mapping each set id to a list of n-grams, where every n-gram is a
        tuple of the items produced by ``pos_tag``. Sets without qualifying
        essays map to an empty list.
    """
    # Every requested set gets an entry, even if nothing passes the threshold
    ngram_results = {set_id: [] for set_id in set_ids}

    for set_id in set_ids:
        # Keep only the essays of this set that received the target score
        filtered_df = df[(df['EssaySet'] == set_id) & (df['Score1'] == target_score)]

        # Print progress for each EssaySet
        print(f"Processing EssaySet {set_id} with {len(filtered_df)} essays.")

        # Counts n-grams across all essays in the set
        ngram_counter = Counter()

        for essay in filtered_df['EssayText']:
            # Missing text is read from CSV as NaN (a float), which
            # word_tokenize cannot handle; skip anything that is not a string.
            if not isinstance(essay, str):
                continue

            pos_tags = pos_tag(word_tokenize(essay))

            # Feed the n-grams straight into the counter, one size at a time,
            # instead of materialising and concatenating intermediate lists.
            for size in ngram_sizes:
                ngram_counter.update(ngrams(pos_tags, size))

        # Keep n-grams that appeared at least count_threshold times
        frequent_ngrams = [ngram for ngram, count in ngram_counter.items() if count >= count_threshold]

        ngram_results[set_id] = frequent_ngrams

        # Print after processing each set
        print(f"Completed processing EssaySet {set_id}, with {len(frequent_ngrams)} n-grams.")

    return ngram_results


In [116]:
# Frequent POS n-grams for essay set 3 only: n-grams seen at least 30 times
# among that set's essays with Score1 == 2.
ngram_results = generate_ngram_results([3], df, 30)

Processing EssaySet 3 with 699 essays.
Completed processing EssaySet 3, with 403 n-grams.


In [83]:
import os

def load_prompts(set_ids, prompt_dir="prompts", encoding="utf-8"):
    """Read the prompt text for each essay set from ``asap_NN.txt`` files.

    Args:
        set_ids: Iterable of integer essay-set ids. Each id is zero-padded to
            two digits to form the file name (``1`` -> ``asap_01.txt``).
        prompt_dir: Directory containing the prompt files. Defaults to the
            ``prompts`` folder relative to the working directory.
        encoding: Text encoding used to read the files. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        dict mapping set id to the file's content with surrounding whitespace
        stripped. Ids whose file does not exist are reported on stdout and
        left out of the dict.
    """
    prompts = {}
    for set_id in set_ids:
        file_name = f"{prompt_dir}/asap_{set_id:02d}.txt"
        # Attempt the open directly rather than checking for existence first,
        # so a file that disappears between check and open cannot crash the loop.
        try:
            with open(file_name, 'r', encoding=encoding) as file:
                prompts[set_id] = file.read().strip()
        except FileNotFoundError:
            print(f"Prompt file {file_name} not found.")
    return prompts

# Essay sets 1 through 10. A set whose prompt file is missing is reported by
# load_prompts and has no entry in `prompts`.
set_ids = list(range(1, 11))  # edit this list to load prompts for other sets
prompts = load_prompts(set_ids)


In [117]:
from utils.feature_extractor import FeatureExtractor

# Project-local extractor, built from the word vectors, the Doc2Vec model, the
# frequent POS n-grams and the prompt texts loaded above.
# NOTE(review): ngram_results is generated for essay set 3 only in this
# notebook -- confirm that pos features are only requested for that set.
feature_extractor = FeatureExtractor(model, doc2vec_model, ngram_results, prompts)

train model

In [99]:
from numpy import hstack

# Run this once with every feature that is needed, then save the result as a CSV so it can be re-loaded later.
def add_features_to_df(df, feature_extractor, essay_set, add_word2vec=False, add_doc2vec=False, add_pos=False, add_prompt_overlap=False):
    """Add the requested feature columns to ``df``, one row at a time.

    ``df`` is modified in place and also returned. For each enabled flag the
    matching column is first reset to ``None`` and then filled per row with
    the value returned by ``feature_extractor``.

    Args:
        df: DataFrame with an ``EssayText`` column, plus an ``EssaySet`` column
            when ``add_pos`` or ``add_prompt_overlap`` is set.
        feature_extractor: Object providing ``word2vec(text)``,
            ``doc2vec(text)``, ``pos(set_id, text)`` and
            ``prompt_overlap(set_id, text)``.
        essay_set: Accepted for the caller's convenience but not used here;
            the set id passed to the extractor comes from each row.
        add_word2vec: Fill ``word2vec_features``.
        add_doc2vec: Fill ``doc2vec_features``.
        add_pos: Fill ``pos_features``.
        add_prompt_overlap: Fill ``prompt_overlap_features``.

    Returns:
        The same ``df`` object that was passed in.
    """
    # (enabled?, destination column, how to compute the value for one row)
    feature_specs = [
        (add_word2vec, 'word2vec_features',
         lambda record: feature_extractor.word2vec(record['EssayText'])),
        (add_doc2vec, 'doc2vec_features',
         lambda record: feature_extractor.doc2vec(record['EssayText'])),
        (add_pos, 'pos_features',
         lambda record: feature_extractor.pos(record['EssaySet'], record['EssayText'])),
        (add_prompt_overlap, 'prompt_overlap_features',
         lambda record: feature_extractor.prompt_overlap(record['EssaySet'], record['EssayText'])),
    ]
    active_specs = [(column, compute) for enabled, column, compute in feature_specs if enabled]

    # Start each requested column empty so it can hold one array-like per cell
    for column, _ in active_specs:
        df[column] = None

    print(f"Processing {len(df)} items...")

    for num, (index, row) in enumerate(df.iterrows()):
        for column, compute in active_specs:
            df.at[index, column] = compute(row)

        # Progress line on the first row and then on every 1000th row
        if num % 1000 == 0:
            print(f"Processed {num} items.")

    return df

In [102]:
# Essay set that the following cells filter df down to.
essay_set = 3

In [103]:
# Work on a deep copy of the selected set's rows, so the feature columns added
# later do not end up in df itself.
essay_set_df = df[df['EssaySet'] == essay_set].copy(deep=True)

In [159]:
# Compute the feature columns for the selected essay set and snapshot them to a CSV file
import os
from datetime import datetime

# The training and evaluation cells read all four feature columns, so compute
# all of them here. Requesting only doc2vec leaves the other three undefined
# on a fresh kernel. The set id comes from `essay_set` rather than a literal
# so it cannot drift from the filter used to build essay_set_df.
add_features_to_df(
    essay_set_df,
    feature_extractor,
    essay_set,
    add_word2vec=True,
    add_doc2vec=True,
    add_pos=True,
    add_prompt_overlap=True,
)

# Create the folder if it doesn't exist
os.makedirs('updated_features', exist_ok=True)

# Timestamp the file name so earlier snapshots are not overwritten
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# NOTE(review): the feature cells hold array-like objects; to_csv writes them
# as text, so they need parsing when the file is loaded again -- confirm the
# loader handles that.
essay_set_df.to_csv(f'updated_features/updated_features_{timestamp}.csv', index=False)


Processing 3152 items...
Processed 0 items.
Processed 1000 items.
Processed 2000 items.
Processed 3000 items.


In [160]:
# prepare data set
import numpy as np

# Columns concatenated into one feature vector per essay. The same list drives
# the missing-value filter, so a row lacking ANY of the stacked features is
# dropped rather than reaching np.hstack with a None in it.
feature_columns = ['word2vec_features', 'doc2vec_features', 'pos_features', 'prompt_overlap_features']

training_data = essay_set_df[(essay_set_df['DataSet'] == 'Train')].copy(deep=True)
training_data = training_data.dropna(subset=feature_columns)

# One row per essay: its feature arrays are joined end to end, then all rows
# are stacked into a 2-D matrix.
X_train = np.vstack([
    np.hstack(parts)
    for parts in zip(*(training_data[column] for column in feature_columns))
])
y_train = training_data['Score1']

In [150]:
# train model; hyper-parameter search
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Search space: integer ranges sampled by RandomizedSearchCV
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Fixed seeds make both the forests and the sampled hyper-parameter
# candidates repeatable across kernel restarts.
clf = RandomForestClassifier(random_state=42)
rand_search = RandomizedSearchCV(clf,
                                 param_distributions=param_dist,
                                 n_iter=5,
                                 cv=5,
                                 random_state=42)

rand_search.fit(X_train, y_train)

# Best candidate, refit on the full training data by the search
best_clf = rand_search.best_estimator_

In [169]:
# Faster alternative to the randomized search: a single forest with fixed
# settings (100 estimators, max depth 100). Seeded so the reported scores
# can be reproduced.
clf = RandomForestClassifier(max_depth=100, n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Running this cell replaces any estimator previously stored in best_clf
best_clf = clf

test model

In [170]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

# Same feature columns, in the same order, as the training matrix. The list
# also drives the missing-value filter, so rows lacking any stacked feature
# are dropped instead of failing inside np.hstack.
eval_feature_columns = ['word2vec_features', 'doc2vec_features', 'pos_features', 'prompt_overlap_features']

# NOTE(review): 'Priva' is kept as-is because it matches rows in this data
# set -- confirm it is the intended label for the held-out split.
val_set = essay_set_df[(essay_set_df['DataSet'] == 'Priva')].copy(deep=True)
val_set = val_set.dropna(subset=eval_feature_columns)

# One row per essay: its feature arrays joined end to end
X_val = np.vstack([
    np.hstack(parts)
    for parts in zip(*(val_set[column] for column in eval_feature_columns))
])
y_val = val_set['Score1']

y_pred = best_clf.predict(X_val)

# Rows are true scores, columns are predicted scores
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Quadratic weighting penalises predictions more the further they are from the true score
kappa_score = cohen_kappa_score(y_val, y_pred, weights='quadratic')
print(f"Cohen Kappa Score (Weighted): {kappa_score}")


Confusion Matrix:
[[ 24 123   2]
 [ 14 338   8]
 [  6  98  18]]
Cohen Kappa Score (Weighted): 0.1950373198372678
