# HW9 - Classify Dev

In [1]:
# Imports
import pandas as pd
import numpy as np
from scipy.stats import mode

# Non-allowed imports just to test
from json import load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Data Load
url = 'https://f000.backblazeb2.com/file/jeldridge-data/012-spanish_french/train.csv'
df = pd.read_csv(url)

In [3]:
true = load(open('asdh.json', 'r'))

## Feature Engineering

In [4]:
def count_syllables(word: str) -> int:
    vowels = 'aeiouy'
    count = 0
    word = word.lower()
    for i in range(len(word)):
        if word[i] in vowels and (i == 0 or word[i-1] not in vowels):
            count += 1
    return count

def generate_features(word: str) -> pd.Series:
    """
    Generates features given a word.
    """
    vowels = ['a', 'e', 'i', 'o', 'u']
    conditions = dict()
    
    # Letter Counts
    conditions['e_count'] = sum(1 for letter in word if letter.lower() in 'e')
    conditions['a_count'] = sum(1 for letter in word if letter.lower() in 'a')
    conditions['u_count'] = sum(1 for letter in word if letter.lower() in 'u')
    conditions['o_count'] = sum(1 for letter in word if letter.lower() in 'o')
    
    # Presence
    conditions['ch_presence'] = 'ch' in word.lower()
    conditions['contains_eu'] = 'eu' in word
    
    # Word Meta
    conditions['syllable_count'] = count_syllables(word)
    conditions['word_length'] = len(word)
    conditions['consonant_vowel_ratio'] = (len(word) - sum(word.lower().count(v) for v in vowels)) /\
                                        max(1, sum(word.lower().count(v) for v in vowels))
    
    # Prefix/Suffix Analysis
    conditions['starts_with_pre'] = word.startswith('pre')
    conditions['starts_with_re'] = word.startswith('re') 
    conditions['ends_with_cion'] = word.endswith('cion') 
    conditions['ends_in_vowel'] = word[-1] in vowels
    conditions['ends_in_two_vowels'] = word[-1] in vowels and word[-2] in vowels
    conditions['ends_in_r'] = word[-1] in 'r'
    
    # Letter Combinations
    conditions['ll_presence'] = 'll' in word
    conditions['qu_presence'] = 'qu' in word
    conditions['ch_presence_fr'] = 'ch' in word
    conditions['ou_presence'] = 'ou' in word
    
    
    return pd.Series(conditions)

proccess_y = lambda y_set: np.array([word == 'spanish' for word in y_set])

In [5]:
features = df.assign(**df['word'].transform(generate_features))
features.head(2)

Unnamed: 0,word,label,e_count,a_count,u_count,o_count,ch_presence,contains_eu,syllable_count,word_length,...,starts_with_pre,starts_with_re,ends_with_cion,ends_in_vowel,ends_in_two_vowels,ends_in_r,ll_presence,qu_presence,ch_presence_fr,ou_presence
0,finalmente,spanish,2,1,0,0,False,False,4,10,...,False,False,False,True,False,False,False,False,False,False
1,secar,spanish,1,1,0,0,False,False,2,5,...,False,False,False,False,False,True,False,False,False,False


## Model Test

In [6]:
# Splitting Data
X = features.drop(columns=['label', 'word'])
y = features['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Testing Baysian Classifier
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Baysian Accuracy:", accuracy)

Baysian Accuracy: 0.6277777777777778


In [8]:
# Testing Random Forrest
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 0.7055555555555556


In [9]:
# Testing Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train, y_train)
y_pred_gb = gb_classifier.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", accuracy_gb)

Gradient Boosting Accuracy: 0.6916666666666667


In [10]:
# Testing Ridge Regression
from sklearn.linear_model import RidgeClassifier
ridge_classifier = RidgeClassifier(random_state=42)
ridge_classifier.fit(X_train, y_train)
y_pred_ridge = ridge_classifier.predict(X_test)
accuracy_ridge = accuracy_score(y_test, y_pred_ridge)
print("Ridge Classifier Accuracy:", accuracy_ridge)

Ridge Classifier Accuracy: 0.6888888888888889


In [11]:
# Trying SVM
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC(random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Linear SVM Accuracy:", accuracy_svm)

Linear SVM Accuracy: 0.6888888888888889




In [12]:
# Trying Decision Tree 
from sklearn.tree import DecisionTreeClassifier
tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train, y_train)
y_pred_tree = tree_classifier.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print("Decision Tree Accuracy:", accuracy_tree)

Decision Tree Accuracy: 0.6833333333333333


## Implementing Model (Random Forrest)

In [13]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if (self.max_depth is not None and depth >= self.max_depth) or n_labels == 1 or n_samples < 2:
            return {'label': mode(y)[0][0]}

        # Find best split
        best_gini = np.inf
        best_feature, best_threshold = None, None
        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature_idx] <= threshold)[0]
                right_indices = np.where(X[:, feature_idx] > threshold)[0]
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue
                gini = self._gini_impurity(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_idx
                    best_threshold = threshold

        if best_feature is None:
            return {'label': mode(y)[0][0]}  # If no valid split found, return the majority class
        
        left_indices = np.where(X[:, best_feature] <= best_threshold)[0]
        right_indices = np.where(X[:, best_feature] > best_threshold)[0]

        # Grow left and right subtrees
        left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return {'feature_idx': best_feature,
                'threshold': best_threshold,
                'left': left_subtree,
                'right': right_subtree}

    def _gini_impurity(self, left_y, right_y):
        p_left = len(left_y) / (len(left_y) + len(right_y))
        p_right = len(right_y) / (len(left_y) + len(right_y))
        return p_left * (1 - np.sum(np.square(np.bincount(left_y) / len(left_y)))) + \
               p_right * (1 - np.sum(np.square(np.bincount(right_y) / len(right_y))))

    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, tree):
        if 'label' in tree:
            return tree['label']
        else:
            if x[tree['feature_idx']] <= tree['threshold']:
                return self._predict_tree(x, tree['left'])
            else:
                return self._predict_tree(x, tree['right'])

class RandomForest:
    def __init__(self, n_estimators=100, max_depth=None, max_features=None, bootstrap=True):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.trees = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        if not self.max_features:
            self.max_features = int(np.sqrt(n_features))

        for _ in range(self.n_estimators):
            if self.bootstrap:
                indices = np.random.choice(n_samples, n_samples, replace=True)
            else:
                indices = np.arange(n_samples)
            X_bootstrap = X[indices]
            y_bootstrap = y[indices]

            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_bootstrap, y_bootstrap)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return mode(predictions)[0][0]

rf = RandomForest(n_estimators=100, max_depth=10)
rf.fit(X_train.to_numpy(), proccess_y(y_train))
predictions = rf.predict(X_test.to_numpy())
accuracy_score(predictions, proccess_y(y_test))

  return {'label': mode(y)[0][0]}
  return {'label': mode(y)[0][0]}
  return {'label': mode(y)[0][0]}  # If no valid split found, return the majority class
  return {'label': mode(y)[0][0]}  # If no valid split found, return the majority class
  return mode(predictions)[0][0]
  return mode(predictions)[0][0]


0.6972222222222222