In [15]:
import pandas as pd
import plotly.express as px
import numpy as np
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

from textstat import textstat # Used to get reading level
from wordfreq import zipf_frequency # Used to get word frequency
import nltk # Used to classify words

#### Load Data

In [16]:
df = pd.read_csv('Problem_C_Data_Wordle.csv', encoding='latin1');
df.head()

Unnamed: 0,Date,Contest number,Word,Number of reported results,Number in hard mode,1 try,2 tries,3 tries,4 tries,5 tries,6 tries,7 or more tries (X)
0,12/31/2022,560,manly,20380,1899,0,2,17,37,29,12,2
1,12/30/2022,559,molar,21204,1973,0,4,21,38,26,9,1
2,12/29/2022,558,havoc,20001,1919,0,2,16,38,30,12,2
3,12/28/2022,557,impel,20160,1937,0,3,21,40,25,9,1
4,12/27/2022,556,condo,20879,2012,0,2,17,35,29,14,3


#### Fix the typo? And special characters

In [17]:
# Replace the Word 'marxh' in the dataframe with 'march'
df['Word'] = df['Word'].replace('marxh', 'march')
df['Word'] = df['Word'].replace('naï¿½ve', 'naive')

print(len(df[df['Word'] == 'march']))
print(len(df[df['Word'] == 'naive']))

1
1


#### Drop data that we can't use

In [18]:
#df = df.drop(['Contest number', 'Number of reported results', 'Number in hard mode'], axis=1)
df = df.drop(columns=['Contest number', '1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries', '7 or more tries (X)'], axis=1)
df.head()

Unnamed: 0,Date,Word,Number of reported results,Number in hard mode
0,12/31/2022,manly,20380,1899
1,12/30/2022,molar,21204,1973
2,12/29/2022,havoc,20001,1919
3,12/28/2022,impel,20160,1937
4,12/27/2022,condo,20879,2012


#### Create feature columns for word

In [19]:
import string

def vowels_consonants_ratio(word):
    vowels = 'aeiou'
    word = word.lower()
    num_vowels = sum(1 for char in word if char in vowels)
    num_consonants = sum(1 for char in word if char.isalpha() and char not in vowels)
    if num_consonants == 0:
        return 0  # Avoid division by zero
    return num_vowels / num_consonants

# Scrabble letter values
scrabble_values = {
    'a': 1,  'b': 3,  'c': 3,  'd': 2,  'e': 1,  'f': 4,  'g': 2,  'h': 4,
    'i': 1,  'j': 8,  'k': 5,  'l': 1,  'm': 3,  'n': 1,  'o': 1,  'p': 3,
    'q':10,  'r': 1,  's': 1,  't': 1,  'u': 1,  'v': 4,  'w': 4,  'x': 8,
    'y': 4,  'z':10
}

def scrabble_score(word):
    return sum(scrabble_values.get(char.lower(), 0) for char in word)

def letter_repetition_count(word):
    return len(word) - len(set(word.lower()))

def create_sentence(word):
    return f"The word is {word}."

def frequency_score(word):
    return zipf_frequency(word.lower(), 'en')

words_df = pd.read_csv('wordsv21.csv')
word_list = words_df['word'].tolist()
def word_permutations(word, num_diff_letters):
    count = 0
    for w in word_list:
        if sum(c1 != c2 for c1, c2 in zip(word.lower(), w.lower())) == num_diff_letters:
            count += 1
    return count


nltk.download('averaged_perceptron_tagger_eng')
def word_category(word):
    return nltk.pos_tag([word])[0][1]

common_bigrams = set(['th', 'he', 'in', 'er', 'an', 're', 'ed', 'on', 'es', 'st'])
def common_bigrams_count(word):
    word = word.lower()
    return sum(1 for i in range(len(word)-1) if word[i:i+2] in common_bigrams)

def add_letter_count_features(my_df):
    # Get a list of all lowercase letters
    letters = list(string.ascii_lowercase)
    
    # Initialize new columns in the DataFrame for each letter with zeros
    for letter in letters:
        my_df[letter] = 0

    # Function to count letters in a word
    def count_letters(word):
        word = word.lower()
        letter_counts = {}
        for letter in letters:
            letter_counts[letter] = word.count(letter)
        return letter_counts

    for idx, word in my_df['Word'].items():
        letter_counts = count_letters(word)
        for letter, count in letter_counts.items():
            my_df.at[idx, letter] = count

    return my_df

def create_features(my_df):
    my_df['Vowel consonant ratio'] = my_df['Word'].apply(vowels_consonants_ratio)
    my_df['Scrabble score'] = my_df['Word'].apply(scrabble_score)
    my_df['Letter repetition'] = my_df['Word'].apply(letter_repetition_count)
    my_df['Frequency score'] = my_df['Word'].apply(frequency_score)
    my_df['Words 1 away'] = my_df['Word'].apply(lambda row: word_permutations(row, 1)) # Words only one letter permutation away
    my_df['Words 2 away'] = my_df['Word'].apply(lambda row: word_permutations(row, 2)) # Words only one letter permutation away
    my_df['Word category'] = my_df['Word'].apply(word_category)
    my_df['Common bigrams'] = my_df['Word'].apply(common_bigrams_count)
    my_df['Flesch_Reading_Ease'] = my_df['Word'].apply(lambda x: textstat.flesch_reading_ease(create_sentence(x)))
    my_df['Dale_Chall_Readability_Score'] = my_df['Word'].apply(lambda x: textstat.dale_chall_readability_score(create_sentence(x)))
    my_df['Difficult_Words'] = my_df['Word'].apply(lambda x: textstat.difficult_words(create_sentence(x)))
    # my_df['Flesch_Kincaid_Grade'] = my_df['Word'].apply(lambda x: textstat.flesch_kincaid_grade(create_sentence(x)))

    my_df = pd.get_dummies(my_df, columns=['Word category'])
    my_df = add_letter_count_features(my_df)
    return my_df

df = create_features(df)

df.head()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\leviw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Unnamed: 0,Date,Word,Number of reported results,Number in hard mode,Vowel consonant ratio,Scrabble score,Letter repetition,Frequency score,Words 1 away,Words 2 away,...,q,r,s,t,u,v,w,x,y,z
0,12/31/2022,manly,20380,1899,0.25,10,0,3.66,8,149,...,0,0,0,0,0,0,0,0,1,0
1,12/30/2022,molar,21204,1973,0.666667,7,0,2.99,12,91,...,0,1,0,0,0,0,0,0,0,0
2,12/29/2022,havoc,20001,1919,0.666667,13,0,3.5,0,15,...,0,0,0,0,0,1,0,0,0,0
3,12/28/2022,impel,20160,1937,0.666667,9,0,2.36,1,19,...,0,0,0,0,0,0,0,0,0,0
4,12/27/2022,condo,20879,2012,0.666667,8,1,3.63,9,72,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df[df['Word'] == 'mummy']['Letter repetition']

69    2
Name: Letter repetition, dtype: int64

#### Transform date column

In [21]:
def encode_date(my_df):
  my_df['Date'] = pd.to_datetime(my_df['Date'])

  my_df['Month'] = my_df['Date'].dt.month
  my_df['Day of week'] = my_df['Date'].dt.dayofweek  # Monday=0, Sunday=6
  my_df['Year'] = my_df['Date'].dt.year
  my_df['Day'] = my_df['Date'].dt.day

  my_df = pd.get_dummies(my_df, columns=['Month', 'Day of week', 'Day'])
  my_df = my_df.drop(['Date'], axis=1)

  return my_df

df = encode_date(df)
print(df.columns)

Index(['Word', 'Number of reported results', 'Number in hard mode',
       'Vowel consonant ratio', 'Scrabble score', 'Letter repetition',
       'Frequency score', 'Words 1 away', 'Words 2 away', 'Common bigrams',
       ...
       'Day_22', 'Day_23', 'Day_24', 'Day_25', 'Day_26', 'Day_27', 'Day_28',
       'Day_29', 'Day_30', 'Day_31'],
      dtype='object', length=101)


In [22]:
print(len(df.columns))

101


#### Create Model

In [23]:
X = df.drop(columns=['Number of reported results', 'Number in hard mode', 'Word'], axis=1)
y = df[['Number of reported results', 'Number in hard mode']]

# Save features for later encoding
feature_columns = X.columns.tolist()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Initialize the model
xgb = XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

# Wrap the model with MultiOutputRegressor
multi_output_model = MultiOutputRegressor(xgb)

# Train the model
multi_output_model.fit(X_train, y_train)

# Predict the distribution
y_pred = multi_output_model.predict(X_test)


print(f'RMSE: {root_mean_squared_error(y_test, y_pred)}')

# r2_values = {}
# for i, target in enumerate(y.columns):
#     r2_values[target] = r2_score(y_test.iloc[:, i], y_pred[:, i])
#     print(f"R-squared for {target}: {r2_values[target]:.4f}")

# mean_r2 = np.mean(list(r2_values.values()))
# print(f"Mean R-squared: {mean_r2:.4f}")

# test_length = len(y_pred)

# for key in r2_values:
#     r2_values[key] = (1 - r2_values[key]) * (test_length - 1) / (test_length - len(df.columns) - 1)
#     print(r2_values[key])


RMSE: 20837.43202060057


#### Feature Importances

In [24]:
# Show the model feature importances
importances = multi_output_model.estimators_[0].feature_importances_
importances_df = pd.DataFrame(importances, index=X.columns, columns=['Importance'])
importances_df = importances_df.sort_values('Importance', ascending=False)

pd.options.display.max_rows = 100

importances_df

Unnamed: 0,Importance
Month_2,0.4736609
Month_1,0.2356944
Month_3,0.1447512
Month_4,0.05851172
Day_8,0.01365276
Month_5,0.01289475
Day_14,0.008091037
g,0.006461823
f,0.006165216
Day_15,0.004769026


### Graph time

#### Predict for Eerie

In [25]:
eerie_df = pd.DataFrame({
  'Date': '2023-03-01',
  'Word': 'EERIE'
}, index=[0])

eerie_df = create_features(eerie_df)
eerie_df = encode_date(eerie_df)
eerie_df = eerie_df.drop(columns=['Word'], axis=1)

eerie_df = eerie_df.reindex(columns=feature_columns, fill_value=0)

output = multi_output_model.predict(eerie_df)
print(output)

[[229989.6     9957.176]]
