# Median Baseline

In [2]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import torch
from collections import defaultdict, Counter
import random
import math
import pickle
import string

import wordfreq
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

import src.eval_metric

%matplotlib inline
%load_ext autoreload
%autoreload 2
# Raise pandas' display limits to 100 columns / 100 rows for wide frames.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [4]:
# Read the training and development (validation) splits from CSV.
train_df, valid_df = map(
  pd.read_csv,
  ("../../data/training_data/train.csv", "../../data/training_data/dev.csv"),
)

In [6]:
# Target columns to predict; the baseline fills each one with a single constant.
output_var_names = ['FFDAvg', 'FFDStd', 'TRTAvg', 'TRTStd']
# Median of every target column, computed on the training split only.
train_medians = {name: train_df[name].median() for name in output_var_names}
# assign() returns a new frame, so valid_df itself is left untouched.
predict_df = valid_df.assign(**train_medians)

In [8]:
# Score the constant-median predictions against the actual dev-set values.
src.eval_metric.evaluate(predict_df, valid_df)

MAE for FFDAvg: 5.931158700600516
MAE for FFDStd: 2.5783665272617666
MAE for TRTAvg: 8.999091204200285
MAE for TRTStd: 5.88665248133298
Overall MAE: 5.848817228348887


5.848817228348887

## Simple Feature-based Regression

In [9]:
# Names of the hand-crafted input features, in the order get_features emits them.
input_var_names = ['length', 'logfreq', 'has_upper', 'has_punct']
def get_features(token):
  """Compute simple surface features for a single token.

  Any '<EOS>' marker is stripped from the token before the features are
  computed.

  Returns a pd.Series with the entries:
    length:    number of characters in the stripped token
    logfreq:   wordfreq.zipf_frequency of the stripped token for 'en'
    has_upper: 1 if lower-casing changes the token, otherwise 0
    has_punct: 1 if any character is in string.punctuation, otherwise 0
  """
  word = token.replace('<EOS>', '')
  contains_upper = word != word.lower()
  contains_punct = not set(word).isdisjoint(string.punctuation)
  features = {
    'length': len(word),
    'logfreq': wordfreq.zipf_frequency(word, 'en'),
    'has_upper': int(contains_upper),
    'has_punct': int(contains_punct),
  }
  return pd.Series(features)

def clip_to_100(val):
  """Clamp val to the closed interval [0, 100].

  Values below 0 become 0, values above 100 become 100, and anything in
  between is returned unchanged.
  """
  return min(max(val, 0), 100)

In [10]:
# Expand each training word into the feature columns listed in input_var_names.
train_df[input_var_names] = train_df.word.apply(get_features)

In [11]:
# Expand each dev-set word into the same feature columns as the training split.
valid_df[input_var_names] = valid_df.word.apply(get_features)

In [12]:
# Fit one independent regressor per target on the surface features, then
# predict that target for the dev split. Predictions are clamped to [0, 100].
predict_df = valid_df.copy()
for feat_name in output_var_names:
  # SVR with its default settings; LinearRegression() is a drop-in alternative.
  model = SVR()
  model.fit(train_df[input_var_names], train_df[feat_name])

  # Clamp the whole prediction array in one vectorised call instead of
  # applying a Python function to every element.
  raw_predictions = model.predict(predict_df[input_var_names])
  predict_df[feat_name] = np.clip(raw_predictions, 0, 100)

In [13]:
# Score the feature-based regression predictions against the actual dev-set values.
src.eval_metric.evaluate(predict_df, valid_df)

MAE for FFDAvg: 4.625332291544271
MAE for FFDStd: 2.304362286876658
MAE for TRTAvg: 7.38086334355873
MAE for TRTStd: 5.2781108246960144
Overall MAE: 4.897167186668918


4.897167186668918