In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('df.csv')

In [153]:
len(df)

28682

setup

In [2]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)  

In [1]:
from gensim.models.doc2vec import Doc2Vec
doc2vec_model = Doc2Vec.load("doc2vec_model/doc2vec_wiki_d300_n5_w8_mc50_t12_e10_dbow.model")

In [20]:
import nltk
from nltk import ngrams, pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter

# Initialize dictionary to store results
ngram_results = {i: [] for i in range(1, 11)}

# Loop through each EssaySet
for set_id in range(1, 11):
    # Filter DataFrame for current EssaySet and score1 == 2
    filtered_df = df[(df['EssaySet'] == set_id) & (df['Score1'] == 2)]
    
    # Print progress for each EssaySet
    print(f"Processing EssaySet {set_id} with {len(filtered_df)} essays.")
    
    # Initialize a Counter to count n-grams across all essays in the set
    ngram_counter = Counter()
    
    # Process each EssayText in the filtered DataFrame
    for essay in filtered_df['EssayText']:
        tokens = word_tokenize(essay)
        pos_tags = pos_tag(tokens)
        
        # Generate bi-grams, tri-grams, and tetra-grams
        bi_grams = list(ngrams(pos_tags, 2))
        tri_grams = list(ngrams(pos_tags, 3))
        tetra_grams = list(ngrams(pos_tags, 4))
        
        # Concatenate all n-grams into a single list and update the counter
        all_ngrams = bi_grams + tri_grams + tetra_grams
        ngram_counter.update(all_ngrams)
    
    # Filter n-grams that appeared at least 3 times
    frequent_ngrams = [ngram for ngram, count in ngram_counter.items() if count >= 3]
    
    # Store the frequent n-grams in the results dictionary
    ngram_results[set_id] = frequent_ngrams
    
    # Print after processing each set
    print(f"Completed processing EssaySet {set_id}.")

Processing EssaySet 1 with 886 essays.
Completed processing EssaySet 1.
Processing EssaySet 2 with 751 essays.
Completed processing EssaySet 2.
Processing EssaySet 3 with 699 essays.
Completed processing EssaySet 3.
Processing EssaySet 4 with 230 essays.
Completed processing EssaySet 4.
Processing EssaySet 5 with 73 essays.
Completed processing EssaySet 5.
Processing EssaySet 6 with 126 essays.
Completed processing EssaySet 6.
Processing EssaySet 7 with 723 essays.
Completed processing EssaySet 7.
Processing EssaySet 8 with 1313 essays.
Completed processing EssaySet 8.
Processing EssaySet 9 with 1031 essays.
Completed processing EssaySet 9.
Processing EssaySet 10 with 970 essays.
Completed processing EssaySet 10.


In [25]:
len(ngram_results[4])

2196

In [26]:
%load_ext autoreload
%autoreload 2
from utils.feature_extractor import FeatureExtractor

feature_extractor = FeatureExtractor(model, doc2vec_model, ngram_results)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
feature_extractor.word2vec(df["EssayText"][0])

array([ 0.03359786,  0.03068144,  0.08086893,  0.06541708, -0.08403745,
        0.03153593,  0.04938126, -0.08173669,  0.04041249,  0.06147484,
       -0.05967547, -0.12559907,  0.00784169,  0.0573966 , -0.12490447,
        0.07106881, -0.01718471,  0.08739305, -0.04464357, -0.04933896,
       -0.03385805,  0.04380798, -0.0530432 , -0.00938697, -0.0161093 ,
        0.02423825, -0.06357574,  0.03766665,  0.00296783, -0.01800272,
       -0.04343978,  0.01457015, -0.03113324, -0.03124536, -0.00251206,
        0.01467099,  0.0030438 , -0.03163346,  0.05517147,  0.05589162,
        0.11296811, -0.01669577,  0.09594063, -0.02404719, -0.02978118,
       -0.06440105, -0.06560085,  0.05897257, -0.00286683,  0.02105427,
        0.01296466,  0.03543091, -0.05443888, -0.08405603, -0.01502792,
        0.01857127, -0.03955078, -0.04374089,  0.05665323, -0.01266844,
       -0.02411088,  0.0377893 , -0.08461197, -0.08247044, -0.00110129,
       -0.02437194, -0.03194394,  0.08186415,  0.01744146,  0.02

In [9]:
feature_extractor.doc2vec(df["EssayText"][0])

array([-1.79076985e-01,  3.72598357e-02,  3.05328723e-02,  3.18719149e-01,
       -1.23936079e-01,  9.24369991e-02, -3.02028835e-01, -2.08003715e-01,
       -8.07130560e-02, -3.35802250e-02,  3.02772075e-02, -1.59942180e-01,
        2.41348162e-01, -1.34278059e-01, -3.37692462e-02, -1.64364725e-02,
        8.10378417e-02, -2.29826085e-02,  2.22176462e-01,  2.23984703e-01,
        3.44838910e-02,  8.40338971e-03, -1.12910852e-01,  1.87015146e-01,
       -1.75743550e-01,  2.45637208e-01, -2.34440535e-01, -1.05330415e-01,
        3.53634208e-02, -2.70618767e-01, -2.01804966e-01,  2.51013756e-01,
        3.14657301e-01,  5.35868891e-02,  6.05955869e-02,  8.53149965e-02,
       -2.15156041e-02, -2.98972931e-02, -2.89557666e-01,  2.42997110e-01,
       -1.09862544e-01, -5.05009174e-01, -4.39214073e-02, -6.71788156e-02,
        6.42242357e-02, -1.85954139e-01, -2.87785418e-02, -1.29850104e-01,
       -2.13951632e-01, -1.48082197e-01, -1.15448721e-01,  2.22652733e-01,
        3.10764402e-01,  

In [51]:
question_no = 2
len(feature_extractor.pos(df["EssaySet"][question_no], df["EssayText"][question_no]))

6744

train model

In [166]:
from numpy import hstack

# run this for everythting ... so i can just do it once, and then save it as a csv to re-load ... 
def add_features_to_df(df, feature_extractor):
    # Filter DataFrame for training data only
    train_df = df

    train_df['word2vec_features'] = None
    train_df['doc2vec_features'] = None
    train_df['pos_features'] = None

    total_items = len(train_df)
    print(f"Processing {total_items} items...")

    # Iterate over each row in the training DataFrame
    num = 0
    for index, row in train_df.iterrows():
        # Extract features using the feature_extractor object
        word2vec_features = feature_extractor.word2vec(row['EssayText'])
        doc2vec_features = feature_extractor.doc2vec(row['EssayText'])
        pos_features = feature_extractor.pos(row['EssaySet'], row['EssayText'])

        # Assign the extracted features to the DataFrame
        df.at[index, 'word2vec_features'] = word2vec_features
        df.at[index, 'doc2vec_features'] = doc2vec_features
        df.at[index, 'pos_features'] = pos_features

        # Update after every 1000 items
        if (num) % 1000 == 0:
            print(f"Processed {num} items.")
        num += 1

    return df


In [92]:
# Execute the function to add features to the DataFrame and display the updated DataFrame
updated_df = add_features_to_df(df, feature_extractor)

# Save the updated DataFrame to a CSV file
updated_df.to_csv('updated_features.csv', index=False)

Processing 17207 items...
Processed 0 items.
Processed 1000 items.
Processed 2000 items.
Processed 3000 items.
Processed 4000 items.
Processed 5000 items.
Processed 6000 items.
Processed 7000 items.
Processed 8000 items.
Processed 9000 items.
Processed 10000 items.
Processed 11000 items.
Processed 12000 items.
Processed 13000 items.
Processed 14000 items.
Processed 15000 items.
Processed 16000 items.
Processed 17000 items.


In [191]:
# prepare data set 
import numpy as np

essay_set = 3
training_data = updated_df[(updated_df['EssaySet'] == essay_set) & (updated_df['DataSet'] == 'Train')]
training_data = training_data.dropna(subset=['word2vec_features', 'doc2vec_features'])
X_train = np.vstack(training_data[['word2vec_features', 'doc2vec_features', 'pos_features']].apply(lambda x: np.hstack(x), axis=1).values)
y_train = training_data['Score1']

ValueError: need at least one array to concatenate

In [177]:
# train model
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}
clf = RandomForestClassifier()
rand_search = RandomizedSearchCV(clf, 
                                param_distributions = param_dist, 
                                n_iter=5, 
                                cv=5)
                                

rand_search.fit(X_train, y_train)
best_clf = rand_search.best_estimator_

test model

In [186]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

val_set = updated_df[(updated_df['DataSet'] == 'Priva') & (updated_df['EssaySet'] == essay_set)]
val_set = val_set.dropna(subset=['word2vec_features', 'doc2vec_features'])

X_val = np.vstack(val_set[['word2vec_features', 'doc2vec_features', 'pos_features']].apply(lambda x: np.hstack(x), axis=1).values)
y_val = val_set['Score1']

y_pred = best_clf.predict(X_val)

conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

kappa_score = cohen_kappa_score(y_val, y_pred)
print(f"Cohen Kappa Score: {kappa_score}")

Confusion Matrix:
[[ 17 132   0]
 [  4 354   2]
 [  2 105  15]]
Cohen Kappa Score: 0.14066147859922173
