# HW9 - Classify Dev

In [1]:
# Imports
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Data Load
# Read the training CSV straight from its remote location; later cells rely on
# its 'word' and 'label' columns.
url = (
    'https://f000.backblazeb2.com/file/jeldridge-data/'
    '012-spanish_french/train.csv'
)
df = pd.read_csv(url)

## Feature Engineering

In [3]:
def count_syllables(word: str) -> int:
    """
    Estimate the number of syllables in a word.

    Every maximal run of consecutive vowels (a, e, i, o, u, y; matched
    case-insensitively) is counted as one syllable.

    Parameters
    ----------
    word : str
        The word to analyse.

    Returns
    -------
    int
        The number of vowel runs found; 0 for an empty string.
    """
    vowel_set = set('aeiouy')
    syllables = 0
    previous_was_vowel = False
    for char in word.lower():
        is_vowel = char in vowel_set
        # Only the first vowel of a run opens a new syllable.
        if is_vowel and not previous_was_vowel:
            syllables += 1
        previous_was_vowel = is_vowel
    return syllables

def generate_features(word: str) -> pd.Series:
    """
    Generates features given a word.

    The word is lower-cased once so that every textual feature is
    case-insensitive (previously only some of them were), and the suffix
    checks no longer raise IndexError on empty or one-letter words such as
    'a' or 'o'.

    Parameters
    ----------
    word : str
        The word to featurise. May be empty.

    Returns
    -------
    pd.Series
        One entry per feature, keyed by feature name, in a fixed order.
        Values are booleans (presence / prefix / suffix flags), integers
        (letter counts, syllable count, length) and one float
        (consonant_vowel_ratio).
    """
    vowels = ('a', 'e', 'i', 'o', 'u')

    lowered = word.lower()
    # Computed once; used by the ratio below as both numerator and denominator input.
    vowel_count = sum(lowered.count(v) for v in vowels)

    conditions = dict()
    # str.endswith accepts a tuple of suffixes and is simply False for ''.
    conditions['ends_in_vowel'] = lowered.endswith(vowels)
    conditions['ends_in_r'] = lowered.endswith('r')
    # The length guard keeps one-letter words from indexing lowered[-2].
    conditions['ends_in_two_vowels'] = (
        len(lowered) >= 2 and lowered[-1] in vowels and lowered[-2] in vowels
    )
    conditions['contains_eu'] = 'eu' in lowered
    conditions['e_count'] = lowered.count('e')
    conditions['a_count'] = lowered.count('a')
    conditions['u_count'] = lowered.count('u')
    conditions['ch_presence'] = 'ch' in lowered
    conditions['syllable_count'] = count_syllables(word)
    conditions['word_length'] = len(word)
    # Every non-vowel character counts as a "consonant"; max(1, ...) avoids
    # dividing by zero for vowel-less words.
    conditions['consonant_vowel_ratio'] = (len(word) - vowel_count) / max(1, vowel_count)

    # Prefix/Suffix Analysis
    conditions['starts_with_pre'] = lowered.startswith('pre') # Spanish
    conditions['starts_with_re'] = lowered.startswith('re') # French
    conditions['ends_with_cion'] = lowered.endswith('cion')
    conditions['ends_with_ment'] = lowered.endswith('ment') # French

    # Letter Combinations
    conditions['ll_presence'] = 'll' in lowered # Spanish
    conditions['qu_presence'] = 'qu' in lowered # Spanish
    # NOTE(review): duplicates 'ch_presence' above; kept so the feature set
    # (and therefore the column layout seen by callers) is unchanged.
    conditions['ch_presence_fr'] = 'ch' in lowered # French
    conditions['ou_presence'] = 'ou' in lowered # French

    return pd.Series(conditions)

In [4]:
# Spot-check: display the feature vector produced for a single sample word.
generate_features('finalmente')

ends_in_vowel             True
ends_in_r                False
ends_in_two_vowels       False
contains_eu              False
e_count                      2
a_count                      1
u_count                      0
ch_presence              False
syllable_count               4
word_length                 10
consonant_vowel_ratio      1.5
starts_with_pre          False
starts_with_re           False
ends_with_cion           False
ends_with_ment           False
ll_presence              False
qu_presence              False
ch_presence_fr           False
ou_presence              False
dtype: object

In [5]:
# Build the modelling table: the original columns plus one column per feature.
# Series.apply is the documented way to expand a Series-returning function
# into a DataFrame (transform() is intended for same-shaped results), and
# assign() then attaches each feature column to a copy of df.
features = df.assign(**df['word'].apply(generate_features))
features

Unnamed: 0,word,label,ends_in_vowel,ends_in_r,ends_in_two_vowels,contains_eu,e_count,a_count,u_count,ch_presence,...,word_length,consonant_vowel_ratio,starts_with_pre,starts_with_re,ends_with_cion,ends_with_ment,ll_presence,qu_presence,ch_presence_fr,ou_presence
0,finalmente,spanish,True,False,False,False,2,1,0,False,...,10,1.500000,False,False,False,False,False,False,False,False
1,secar,spanish,False,True,False,False,1,1,0,False,...,5,1.500000,False,False,False,False,False,False,False,False
2,aprendio,spanish,True,False,True,False,1,1,0,False,...,8,1.000000,False,False,False,False,False,False,False,False
3,entrenamiento,spanish,True,False,False,False,3,1,0,False,...,13,1.166667,False,False,False,False,False,False,False,False
4,suave,spanish,True,False,False,False,1,1,1,False,...,5,0.666667,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,milieu,french,True,False,True,True,1,0,1,False,...,6,0.500000,False,False,False,False,False,False,False,False
1196,entierement,french,False,False,False,False,4,0,0,False,...,11,1.200000,False,False,False,True,False,False,False,False
1197,quand,french,False,False,False,False,0,1,1,False,...,5,1.500000,False,False,False,False,False,True,False,False
1198,doree,french,True,False,True,False,2,0,0,False,...,5,0.666667,False,False,False,False,False,False,False,False


## Model Test

In [6]:
# Preview the first two rows of the engineered feature table before modelling.
features.head(2)

Unnamed: 0,word,label,ends_in_vowel,ends_in_r,ends_in_two_vowels,contains_eu,e_count,a_count,u_count,ch_presence,...,word_length,consonant_vowel_ratio,starts_with_pre,starts_with_re,ends_with_cion,ends_with_ment,ll_presence,qu_presence,ch_presence_fr,ou_presence
0,finalmente,spanish,True,False,False,False,2,1,0,False,...,10,1.5,False,False,False,False,False,False,False,False
1,secar,spanish,False,True,False,False,1,1,0,False,...,5,1.5,False,False,False,False,False,False,False,False


In [7]:
# Splitting Data
X = features.drop(columns=['label', 'word'])
y = features['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Testing Bayesian Classifier
# MultinomialNB is imported in the imports cell at the top of the notebook.
# Fit on the training split, then score on the held-out split.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Baysian Accuracy:", accuracy)

Baysian Accuracy: 0.6694444444444444


In [9]:
# Testing Random Forest
# RandomForestClassifier is imported in the imports cell at the top of the notebook.
# random_state is fixed so the fitted forest is reproducible.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 0.6805555555555556


In [10]:
# Testing Boosting
# GradientBoostingClassifier is imported in the imports cell at the top of the notebook.
# random_state is fixed so the fitted ensemble is reproducible.
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train, y_train)
y_pred_gb = gb_classifier.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", accuracy_gb)

Gradient Boosting Accuracy: 0.6777777777777778


In [11]:
# Testing Ridge Classifier
# RidgeClassifier is imported in the imports cell at the top of the notebook.
# Fit on the training split, then score on the held-out split.
ridge_classifier = RidgeClassifier(random_state=42)
ridge_classifier.fit(X_train, y_train)
y_pred_ridge = ridge_classifier.predict(X_test)
accuracy_ridge = accuracy_score(y_test, y_pred_ridge)
print("Ridge Classifier Accuracy:", accuracy_ridge)

Ridge Classifier Accuracy: 0.6916666666666667


In [12]:
# Trying SVM
# LinearSVC is imported in the imports cell at the top of the notebook.
# Fit on the training split, then score on the held-out split.
svm_classifier = LinearSVC(random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Linear SVM Accuracy:", accuracy_svm)

Linear SVM Accuracy: 0.6861111111111111




In [13]:
# Trying Decision Tree
# DecisionTreeClassifier is imported in the imports cell at the top of the notebook.
# random_state is fixed so the fitted tree is reproducible.
tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train, y_train)
y_pred_tree = tree_classifier.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print("Decision Tree Accuracy:", accuracy_tree)

Decision Tree Accuracy: 0.6472222222222223
