In [405]:
import pandas as pd
import plotly.express as px
import numpy as np
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import root_mean_squared_error

from textstat import textstat # Used to get reading level
from wordfreq import zipf_frequency # Used to get word frequency
import nltk # Used to classify words

#### Load Data

In [406]:
df = pd.read_csv('Problem_C_Data_Wordle.csv', encoding='latin1');
df.head()

Unnamed: 0,Date,Contest number,Word,Number of reported results,Number in hard mode,1 try,2 tries,3 tries,4 tries,5 tries,6 tries,7 or more tries (X)
0,12/31/2022,560,manly,20380,1899,0,2,17,37,29,12,2
1,12/30/2022,559,molar,21204,1973,0,4,21,38,26,9,1
2,12/29/2022,558,havoc,20001,1919,0,2,16,38,30,12,2
3,12/28/2022,557,impel,20160,1937,0,3,21,40,25,9,1
4,12/27/2022,556,condo,20879,2012,0,2,17,35,29,14,3


#### Fix the typo? And special characters

In [407]:
# Replace the Word 'marxh' in the dataframe with 'march'
df['Word'] = df['Word'].replace('marxh', 'march')
df['Word'] = df['Word'].replace('naï¿½ve', 'naive')

print(len(df[df['Word'] == 'march']))
print(len(df[df['Word'] == 'naive']))

1
1


#### Drop data that we can't use

In [408]:
df = df.drop(['Contest number', 'Number of reported results', 'Number in hard mode'], axis=1)
df.head()

Unnamed: 0,Date,Word,1 try,2 tries,3 tries,4 tries,5 tries,6 tries,7 or more tries (X)
0,12/31/2022,manly,0,2,17,37,29,12,2
1,12/30/2022,molar,0,4,21,38,26,9,1
2,12/29/2022,havoc,0,2,16,38,30,12,2
3,12/28/2022,impel,0,3,21,40,25,9,1
4,12/27/2022,condo,0,2,17,35,29,14,3


#### Create feature columns for word

In [409]:
def vowels_consonants_ratio(word):
    vowels = 'aeiou'
    word = word.lower()
    num_vowels = sum(1 for char in word if char in vowels)
    num_consonants = sum(1 for char in word if char.isalpha() and char not in vowels)
    if num_consonants == 0:
        return 0  # Avoid division by zero
    return num_vowels / num_consonants

# Scrabble letter values
scrabble_values = {
    'a': 1,  'b': 3,  'c': 3,  'd': 2,  'e': 1,  'f': 4,  'g': 2,  'h': 4,
    'i': 1,  'j': 8,  'k': 5,  'l': 1,  'm': 3,  'n': 1,  'o': 1,  'p': 3,
    'q':10,  'r': 1,  's': 1,  't': 1,  'u': 1,  'v': 4,  'w': 4,  'x': 8,
    'y': 4,  'z':10
}

def scrabble_score(word):
    return sum(scrabble_values.get(char.lower(), 0) for char in word)

def letter_repetition_count(word):
    return len(word) - len(set(word.lower()))

def create_sentence(word):
    return f"The word is {word}."

def frequency_score(word):
    return zipf_frequency(word.lower(), 'en')

words_df = pd.read_csv('wordsv21.csv')
word_list = words_df['word'].tolist()
def word_permutations(word, num_diff_letters):
    count = 0
    for w in word_list:
        if sum(c1 != c2 for c1, c2 in zip(word.lower(), w.lower())) == num_diff_letters:
            count += 1
    return count


nltk.download('averaged_perceptron_tagger_eng')
def word_category(word):
    return nltk.pos_tag([word])[0][1]

common_bigrams = set(['th', 'he', 'in', 'er', 'an', 're', 'ed', 'on', 'es', 'st'])
def common_bigrams_count(word):
    word = word.lower()
    return sum(1 for i in range(len(word)-1) if word[i:i+2] in common_bigrams)

def create_features(my_df):
    my_df['Vowel consonant ratio'] = my_df['Word'].apply(vowels_consonants_ratio)
    my_df['Scrabble score'] = my_df['Word'].apply(scrabble_score)
    my_df['Letter repetition'] = my_df['Word'].apply(letter_repetition_count)
    my_df['Frequency score'] = my_df['Word'].apply(frequency_score)
    my_df['Words 1 away'] = my_df['Word'].apply(lambda row: word_permutations(row, 1)) # Words only one letter permutation away
    # my_df['Words 2 away'] = my_df['Word'].apply(lambda row: word_permutations(row, 2)) # Words only one letter permutation away
    my_df['Word category'] = my_df['Word'].apply(word_category)
    my_df['Common bigrams'] = my_df['Word'].apply(common_bigrams_count)

    my_df['Flesch_Reading_Ease'] = my_df['Word'].apply(lambda x: textstat.flesch_reading_ease(create_sentence(x)))
    my_df['Flesch_Kincaid_Grade'] = my_df['Word'].apply(lambda x: textstat.flesch_kincaid_grade(create_sentence(x)))
    my_df['Automated_Readability_Index'] = my_df['Word'].apply(lambda x: textstat.automated_readability_index(create_sentence(x)))
    my_df['Dale_Chall_Readability_Score'] = my_df['Word'].apply(lambda x: textstat.dale_chall_readability_score(create_sentence(x)))
    my_df['Difficult_Words'] = my_df['Word'].apply(lambda x: textstat.difficult_words(create_sentence(x)))

    my_df = pd.get_dummies(my_df, columns=['Word category'])
    return my_df

df = create_features(df)

df.head()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\leviw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Unnamed: 0,Date,Word,1 try,2 tries,3 tries,4 tries,5 tries,6 tries,7 or more tries (X),Vowel consonant ratio,...,Word category_JJ,Word category_JJR,Word category_MD,Word category_NN,Word category_NNS,Word category_PRP$,Word category_RB,Word category_VB,Word category_VBG,Word category_VBN
0,12/31/2022,manly,0,2,17,37,29,12,2,0.25,...,False,False,False,False,False,False,True,False,False,False
1,12/30/2022,molar,0,4,21,38,26,9,1,0.666667,...,False,False,False,True,False,False,False,False,False,False
2,12/29/2022,havoc,0,2,16,38,30,12,2,0.666667,...,False,False,False,True,False,False,False,False,False,False
3,12/28/2022,impel,0,3,21,40,25,9,1,0.666667,...,False,False,False,True,False,False,False,False,False,False
4,12/27/2022,condo,0,2,17,35,29,14,3,0.666667,...,False,False,False,True,False,False,False,False,False,False


In [410]:
df[df['Word'] == 'mummy']['Letter repetition']

69    2
Name: Letter repetition, dtype: int64

#### Transform date column

In [411]:
def encode_date(my_df):
  my_df['Date'] = pd.to_datetime(my_df['Date'])

  my_df['Month'] = my_df['Date'].dt.month
  my_df['Day of week'] = my_df['Date'].dt.dayofweek  # Monday=0, Sunday=6
  my_df['Year'] = my_df['Date'].dt.year
  my_df['Day'] = my_df['Date'].dt.day

  my_df = pd.get_dummies(my_df, columns=['Month', 'Day of week', 'Day'])
  my_df = my_df.drop(['Date'], axis=1)

  return my_df

df = encode_date(df)
print(df.columns)

Index(['Word', '1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries',
       '7 or more tries (X)', 'Vowel consonant ratio', 'Scrabble score',
       'Letter repetition', 'Frequency score', 'Words 1 away',
       'Common bigrams', 'Flesch_Reading_Ease', 'Flesch_Kincaid_Grade',
       'Automated_Readability_Index', 'Dale_Chall_Readability_Score',
       'Difficult_Words', 'Word category_DT', 'Word category_JJ',
       'Word category_JJR', 'Word category_MD', 'Word category_NN',
       'Word category_NNS', 'Word category_PRP$', 'Word category_RB',
       'Word category_VB', 'Word category_VBG', 'Word category_VBN', 'Year',
       'Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6',
       'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12',
       'Day of week_0', 'Day of week_1', 'Day of week_2', 'Day of week_3',
       'Day of week_4', 'Day of week_5', 'Day of week_6', 'Day_1', 'Day_2',
       'Day_3', 'Day_4', 'Day_5', 'Day_6', 'Day_7', 'Day_8', '

#### Create Model

In [412]:
X = df.drop(columns=['1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries', '7 or more tries (X)', 'Word'], axis=1)
y = df[['1 try', '2 tries', '3 tries', '4 tries', '5 tries', '6 tries', '7 or more tries (X)']]

# Save features for later encoding
feature_columns = X.columns.tolist()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
xgb = XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

# Wrap the model with MultiOutputRegressor
multi_output_model = MultiOutputRegressor(xgb)

# Train the model
multi_output_model.fit(X_train, y_train)

# Predict the distribution
y_pred = multi_output_model.predict(X_test)

# Convert predictions to percentages
y_pred_percentage = (y_pred.T / y_pred.sum(axis=1)).T * 100

root_mean_squared_error(y_test, y_pred)

4.954460058967142

In [413]:
import plotly.express as px
import pandas as pd

# Prepare data for plotting
results = pd.DataFrame({
    'Word': df.loc[y_test.index, 'Word'],
    'Actual_1_try': y_test.iloc[:, 0],
    'Predicted_1_try': y_pred[:, 0],
    'Actual_2_tries': y_test.iloc[:, 1],
    'Predicted_2_tries': y_pred[:, 1],
    'Actual_3_tries': y_test.iloc[:, 2],
    'Predicted_3_tries': y_pred[:, 2],
    'Actual_4_tries': y_test.iloc[:, 3],
    'Predicted_4_tries': y_pred[:, 3],
    'Actual_5_tries': y_test.iloc[:, 4],
    'Predicted_5_tries': y_pred[:, 4],
    'Actual_6_tries': y_test.iloc[:, 5],
    'Predicted_6_tries': y_pred[:, 5],
    'Actual_7_or_more_tries': y_test.iloc[:, 6],
    'Predicted_7_or_more_tries': y_pred[:, 6]
})

results['Total_Error'] = np.abs(results['Actual_1_try'] - results['Predicted_1_try']) + \
                         np.abs(results['Actual_2_tries'] - results['Predicted_2_tries']) + \
                         np.abs(results['Actual_3_tries'] - results['Predicted_3_tries']) + \
                         np.abs(results['Actual_4_tries'] - results['Predicted_4_tries']) + \
                         np.abs(results['Actual_5_tries'] - results['Predicted_5_tries']) + \
                         np.abs(results['Actual_6_tries'] - results['Predicted_6_tries']) + \
                         np.abs(results['Actual_7_or_more_tries'] - results['Predicted_7_or_more_tries'])

worst_results = results.nlargest(10, 'Total_Error')

worst_results_melted = worst_results.melt(id_vars='Word', var_name='Type', value_name='Value')

# Create a grouped bar plot
fig = px.bar(worst_results_melted, x='Word', y='Value', color='Type', barmode='group',
             title='Actual vs Predicted Values for the 10 Worst-Performing Predictions')

# Show plot
fig.show()

#### Feature Importances

In [414]:
# Show the model feature importances
importances = multi_output_model.estimators_[0].feature_importances_
importances_df = pd.DataFrame(importances, index=X.columns, columns=['Importance'])
importances_df = importances_df.sort_values('Importance', ascending=False)
importances_df

Unnamed: 0,Importance
Dale_Chall_Readability_Score,0.139943
Month_2,0.126057
Day_24,0.103385
Month_9,0.091947
Month_1,0.088537
...,...
Word category_PRP$,0.000000
Automated_Readability_Index,0.000000
Year,0.000000
Word category_JJR,0.000000


#### Predict for Eerie

In [415]:
eerie_df = pd.DataFrame({
  'Date': '2023-03-01',
  'Word': 'EERIE'
}, index=[0])

eerie_df = create_features(eerie_df)
eerie_df = encode_date(eerie_df)
eerie_df = eerie_df.drop(columns=['Word'], axis=1)

eerie_df = eerie_df.reindex(columns=feature_columns, fill_value=0)

output = multi_output_model.predict(eerie_df)
print(output)
print(output.sum())

[[ 0.17052296  8.650203   20.131477   30.767668   19.569504   15.9542885
   4.0361977 ]]
99.27986
