# Notebook Outline

In this notebook, the puzzle generation history is loaded from a CSV file, and new features are generated from the puzzle metadata (chiefly each puzzle's `puzzle_string`).

In [157]:
import numpy as np
import pandas as pd

In [158]:
import os
# Move the working directory one level up from where the kernel started.
# The starting directory is remembered the first time this cell runs, so that
# re-running the cell does not climb one more level on every execution.
if '_START_DIR' not in globals():
    _START_DIR = os.getcwd()
current_dir = _START_DIR
os.chdir(os.path.dirname(current_dir))
current_dir = os.getcwd()
current_dir

'/home/john/dev'

In [159]:
# NOTE(review): absolute, machine-specific path - this cell only succeeds on a
# host that has this exact directory; the relative CSV path used later depends on it.
os.chdir('/home/john/dev/sudoku-python-tools')

In [160]:
# Load the puzzle generation history. The path is relative, so the result
# depends on the current working directory.
df = pd.read_csv('inputs/csv_files/40_knowns_rib.csv')
df.head(3)  # not rendered: only the last expression of a cell is displayed
len(df)

784

Let's create a new feature - missing_digit_count. We can calculate it for each row by applying a function.

In [161]:
def missing_digits_count(puzzle_string):
    """Return how many of the digits 1-9 never occur in ``puzzle_string``.

    Each digit is tested with ``in``, so the result ranges from 0 (all nine
    digits present) to 9 (none present).
    """
    return sum(1 for digit in '123456789' if digit not in puzzle_string)

# Add the feature as a new column, computed row by row from the puzzle string.
df['missing_digit_count'] = df['puzzle_string'].apply(missing_digits_count) 
df.head(3)

Unnamed: 0,known_count,unique_solution,puzzle_string,time_taken,missing_digit_count
0,40,False,-8962---44--9---2871---4--689---5-12-56--27--2...,0.01,0
1,40,False,729---316------72-316972--4----9-2472--4--961-...,0.02,2
2,40,True,--79---24--37----92-94-3---7-413-6--82--941-33...,0.04,0


Other useful features might be the number of empty nonets and the standard deviation of the number of known cells per nonet.

In [162]:
from tools.constants import nonets

def empty_nonets_count(puzzle_string):
    """Count the nonets whose cells are all the ``'-'`` placeholder.

    ``nonets`` (imported from ``tools.constants``) is iterated as a collection
    of index groups into ``puzzle_string``; a nonet counts as empty when none
    of its indexed characters differs from ``'-'``.
    """
    def has_no_known_cells(nonet):
        known = [puzzle_string[idx] for idx in nonet if puzzle_string[idx] != '-']
        return len(known) == 0

    return sum(1 for nonet in nonets if has_no_known_cells(nonet))

# Add the feature column, then report how many puzzles have at least one empty nonet.
df['empty_nonets_count'] = df['puzzle_string'].apply(empty_nonets_count)
len(df[(df['empty_nonets_count'] > 0)])

18

In [163]:
from statistics import pstdev

def stddev_nonets_population(puzzle_string):
    """Return the population standard deviation of known-cell counts per nonet.

    For every nonet in ``nonets`` the non-``'-'`` cells are counted, and
    ``pstdev`` is taken over those counts. Each counted cell is passed through
    ``int()``, so a character that is neither ``'-'`` nor an integer literal
    raises ``ValueError``.
    """
    ps = puzzle_string
    populations = [
        len([int(ps[idx]) for idx in nonet if ps[idx] != '-'])
        for nonet in nonets
    ]
    return pstdev(populations)

# Add the spread of known cells across nonets as a feature column.
df['stddev_nonet_population'] = df['puzzle_string'].apply(stddev_nonets_population)
df.head(3)

Unnamed: 0,known_count,unique_solution,puzzle_string,time_taken,missing_digit_count,empty_nonets_count,stddev_nonet_population
0,40,False,-8962---44--9---2871---4--689---5-12-56--27--2...,0.01,0,0,1.030402
1,40,False,729---316------72-316972--4----9-2472--4--961-...,0.02,2,0,1.685083
2,40,True,--79---24--37----92-94-3---7-413-6--82--941-33...,0.04,0,0,1.257079


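Try adding the min, mean, standard deviation and max of the per-digit counts. Digits that do not appear in a puzzle are left out rather than counted as zero, so these statistics cover only the digits that are present.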

In [164]:
from collections import defaultdict
from statistics import fmean, pstdev

def get_digit_counts(puzzle_string):
    """Tally how often each numeric character occurs in ``puzzle_string``.

    Returns a ``defaultdict(int)`` keyed by the integer value of each numeric
    character. Characters that never occur get no key, so the mapping only
    describes digits that are present.
    """
    tally = defaultdict(int)
    numeric_chars = (ch for ch in puzzle_string if ch.isnumeric())
    for ch in numeric_chars:
        tally[int(ch)] += 1
    return tally

def get_min_digit_count(puzzle_string):
    """Return the smallest occurrence count among the digits that appear.

    Absent digits are not counted as zero. ``min`` raises ``ValueError`` when
    the string contains no numeric characters.
    """
    return min(get_digit_counts(puzzle_string).values())

def get_mean_digit_count(puzzle_string):
    """Return the mean occurrence count over the digits that appear.

    The average is taken over present digits only, so its denominator shrinks
    as more digits are missing from the puzzle.
    """
    return fmean(get_digit_counts(puzzle_string).values())

def get_stddev_digit_count(puzzle_string):
    """Return the population standard deviation of the per-digit counts.

    Only digits that appear in ``puzzle_string`` contribute a count.
    """
    return pstdev(get_digit_counts(puzzle_string).values())

def get_max_digit_count(puzzle_string):
    """Return the largest occurrence count among the digits that appear.

    ``max`` raises ``ValueError`` when the string contains no numeric
    characters.
    """
    return max(get_digit_counts(puzzle_string).values())



# Add one column per summary statistic of the per-digit counts. Each helper
# recomputes the digit tally from the puzzle string independently.
df['min_digit_count'] = df['puzzle_string'].apply(get_min_digit_count)
df['mean_digit_count'] = df['puzzle_string'].apply(get_mean_digit_count)
df['stddev_digit_count'] = df['puzzle_string'].apply(get_stddev_digit_count)
df['max_digit_count'] = df['puzzle_string'].apply(get_max_digit_count)
df.head(3)

Unnamed: 0,known_count,unique_solution,puzzle_string,time_taken,missing_digit_count,empty_nonets_count,stddev_nonet_population,min_digit_count,mean_digit_count,stddev_digit_count,max_digit_count
0,40,False,-8962---44--9---2871---4--689---5-12-56--27--2...,0.01,0,0,1.030402,1,4.444444,2.006163,8
1,40,False,729---316------72-316972--4----9-2472--4--961-...,0.02,2,0,1.685083,5,5.714286,0.699854,7
2,40,True,--79---24--37----92-94-3---7-413-6--82--941-33...,0.04,0,0,1.257079,1,4.444444,2.114033,8


Try adding features for each nonet.

In [165]:
def get_nonet_pop_per_nonet(puzzle_string, nonet_index):
    """Count the numeric cells of one nonet.

    ``puzzle_string`` is coerced with ``str()``; ``nonet_index`` selects the
    index group ``nonets[nonet_index]``, and each indexed character that
    satisfies ``str.isnumeric`` adds one to the result.
    """
    text = str(puzzle_string)
    return sum(1 for cell in nonets[nonet_index] if text[cell].isnumeric())
    
# Add one population column per nonet index 0..26, named row_0..row_8,
# col_0..col_8 and sqr_0..sqr_8.
# NOTE(review): presumes `nonets` lists 9 rows, then 9 columns, then 9 squares - confirm in tools.constants.
nonet_types = ['row', 'col', 'sqr']
for idx in range(27):
    type_idx, nonet_id = divmod(idx, 9)
    nonet_type = nonet_types[type_idx]
    column_name = f'{nonet_type}_{nonet_id}'
    df[column_name] = df['puzzle_string'].apply(get_nonet_pop_per_nonet, args=(idx,))
        

### Drop the puzzle_string column before correlation analysis

In [166]:
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise KeyError.
df = df.drop(columns=['puzzle_string'], errors='ignore')
df.head(20)

Unnamed: 0,known_count,unique_solution,time_taken,missing_digit_count,empty_nonets_count,stddev_nonet_population,min_digit_count,mean_digit_count,stddev_digit_count,max_digit_count,...,col_8,sqr_0,sqr_1,sqr_2,sqr_3,sqr_4,sqr_5,sqr_6,sqr_7,sqr_8
0,40,False,0.01,0,0,1.030402,1,4.444444,2.006163,8,...,7,5,4,4,5,3,5,4,5,5
1,40,False,0.02,2,0,1.685083,5,5.714286,0.699854,7,...,6,6,3,6,2,3,7,2,5,6
2,40,True,0.04,0,0,1.257079,1,4.444444,2.114033,8,...,5,4,4,3,5,6,5,2,5,6
3,40,False,0.01,0,0,1.422916,1,4.444444,2.266231,7,...,4,4,4,6,7,5,3,6,1,4
4,40,False,0.03,1,0,1.448712,2,5.0,1.5,7,...,6,5,4,4,6,4,8,2,4,3
5,40,False,0.03,0,0,1.594744,1,4.444444,1.571348,7,...,2,2,6,2,5,6,7,5,5,2
6,40,False,0.04,0,0,1.257079,2,4.444444,1.571348,7,...,3,4,5,4,6,5,4,6,2,4
7,40,False,0.05,0,0,0.955814,2,4.444444,2.006163,8,...,3,4,4,3,4,5,5,5,4,6
8,40,False,0.07,1,0,1.227262,3,5.0,1.5,8,...,3,4,4,3,7,5,4,2,7,4
9,40,True,0.09,0,0,1.523479,1,4.444444,1.640536,6,...,1,4,3,4,3,4,4,8,5,5


### Run correlation analysis

In [167]:
# Print each column's dtype and non-null count before computing correlations.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784 entries, 0 to 783
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   known_count              784 non-null    int64  
 1   unique_solution          784 non-null    bool   
 2   time_taken               784 non-null    float64
 3   missing_digit_count      784 non-null    int64  
 4   empty_nonets_count       784 non-null    int64  
 5   stddev_nonet_population  784 non-null    float64
 6   min_digit_count          784 non-null    int64  
 7   mean_digit_count         784 non-null    float64
 8   stddev_digit_count       784 non-null    float64
 9   max_digit_count          784 non-null    int64  
 10  row_0                    784 non-null    int64  
 11  row_1                    784 non-null    int64  
 12  row_2                    784 non-null    int64  
 13  row_3                    784 non-null    int64  
 14  row_4                    7

In [170]:
# Spearman rank correlation of every column with the target, ordered by
# absolute strength. The target's self-correlation is removed by label rather
# than by position, so a feature tied at |r| == 1 cannot be dropped in its place.
corr_spearman = (
    df.corr(method='spearman')['unique_solution']
    .sort_values(key=abs, ascending=False)
    .drop('unique_solution')
)
corr_spearman

stddev_digit_count        -0.161982
max_digit_count           -0.155649
missing_digit_count       -0.101093
mean_digit_count          -0.101093
stddev_nonet_population   -0.072933
row_7                      0.072325
sqr_7                      0.064712
col_2                      0.063535
sqr_3                      0.051505
min_digit_count            0.049926
col_1                     -0.049410
col_4                      0.048492
sqr_0                     -0.048471
time_taken                 0.047675
col_7                     -0.043478
row_1                     -0.042980
sqr_5                     -0.039206
sqr_6                      0.026714
col_5                      0.024519
sqr_8                     -0.023709
row_0                      0.022971
col_8                     -0.021021
row_6                     -0.020330
row_2                     -0.019880
col_0                      0.019009
row_8                      0.017143
row_3                     -0.015077
col_6                     -0

In [169]:
# Pearson correlation of every column with the target, ordered by absolute
# strength. The target's self-correlation is removed by label rather than by
# position, so a feature tied at |r| == 1 cannot be dropped in its place.
corr_pearson = (
    df.corr(method='pearson')['unique_solution']
    .sort_values(key=abs, ascending=False)
    .drop('unique_solution')
)
corr_pearson

stddev_digit_count        -0.162026
max_digit_count           -0.143814
mean_digit_count          -0.104541
missing_digit_count       -0.103814
stddev_nonet_population   -0.082249
sqr_0                     -0.061838
time_taken                 0.060816
row_7                      0.058786
sqr_7                      0.057349
col_1                     -0.055875
Name: unique_solution, dtype: float64