## Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import datetime as dt

In [2]:
sudoku = pd.read_csv("sudoku_counts.csv")

In [3]:
sudoku["Age"] = int(dt.datetime.today().strftime("%Y")) - sudoku["Birth_Year"]
sudoku["Gender_numeric"] = sudoku["Gender"].map({"m":1, "f":0})
sudoku["Average_time"] = sudoku["Total Time (s) for all Puzzles Solved"]/sudoku["Puzzles Solved"]

In [4]:
sudoku = sudoku.drop(sudoku[sudoku.Age > 90].index)
sudoku = sudoku.drop(sudoku[sudoku.Age < 6].index)
sudoku = sudoku.dropna()

In [5]:
sudoku.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739


In [13]:
sudoku.shape

(183212, 15)

In [8]:
avg_age = sudoku.groupby("Country")["Age"].mean()

In [11]:
avg_age[["United States","India","United Kingdom","Canada","Australia"]]

Country
United States     47.623396
India             40.025969
United Kingdom    54.335847
Canada            51.554291
Australia         52.430777
Name: Age, dtype: float64

In [12]:
median_puzzles = sudoku.groupby("Country")["Puzzles Solved"].median()

In [13]:
median_puzzles[["United States","India","United Kingdom","Canada","Australia"]]

Country
United States     5.0
India             3.0
United Kingdom    5.0
Canada            6.0
Australia         5.0
Name: Puzzles Solved, dtype: float64

### 1. Patterns for Levels

## a) Geographic Location of players

In [23]:
## Level 1
lvl1 = sudoku[sudoku["Puzzle Level"]==1]

In [24]:
geo1 = lvl1.groupby("Country")["User ID"].count()

In [25]:
geo1 = pd.DataFrame(geo1.sort_values(ascending = False).head())

In [26]:
geo1.reset_index()

Unnamed: 0,Country,User ID
0,United States,38117
1,India,8976
2,United Kingdom,7511
3,Canada,3987
4,Australia,2002


In [27]:
lvl1.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739
13,8,1,156,850,785,65,253451,115729295,1963,India,0,m,56,1.0,298.177647
22,13,1,234,1,0,0,234,54756,1974,United States,VA,m,45,1.0,234.0


In [12]:
avg_age1 = lvl1.groupby("Country")["Age"].mean()

In [13]:
avg_age1 = pd.Series(avg_age1.loc[["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [14]:
avg_time1 = lvl1.groupby("Country")["Average_time"].mean()

In [15]:
avg_time1 = pd.Series(avg_time1.loc[["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [16]:
avg_puzzle_solved1 = lvl1.groupby("Country")["Puzzles Solved"].mean()

In [17]:
avg_puzzle_solved1 = pd.Series(avg_puzzle_solved1.loc\
                               [["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [18]:
geo1["Average_Age"] = avg_age1

In [19]:
geo1["Average_Time"] = avg_time1

In [20]:
geo1["Average_Puzzle_Solved"] = avg_puzzle_solved1

In [21]:
geo1.rename(columns = {"User ID":"Num_User"}, inplace = True)
geo1.reset_index()

Unnamed: 0,Country,Num_User,Average_Age,Average_Time,Average_Puzzle_Solved
0,United States,38117,45.568119,805.331949,58.883543
1,India,8976,36.938837,812.017469,19.912656
2,United Kingdom,7511,51.305552,752.732962,57.203834
3,Canada,3987,49.731628,777.253922,56.398294
4,Australia,2002,50.343157,800.000598,75.063936


In [22]:
## Level 2

In [28]:
lvl2 = sudoku[sudoku["Puzzle Level"]==2]

In [29]:
geo2 = lvl2.groupby("Country")["User ID"].count()

In [30]:
geo2 = pd.DataFrame(geo2.sort_values(ascending = False).head())

In [31]:
geo2.reset_index()

Unnamed: 0,Country,User ID
0,United States,18288
1,United Kingdom,4195
2,India,4087
3,Canada,2316
4,Australia,921


In [32]:
avg_age2 = lvl2.groupby("Country")["Age"].mean()

In [28]:
avg_age2 = pd.Series(avg_age2.loc[["United States", "United Kingdom", "India", "Canada", "Australia"]])

In [29]:
avg_time2 = lvl2.groupby("Country")["Average_time"].mean()

In [30]:
avg_time2 = pd.Series(avg_time2.loc[["United States", "United Kingdom", "India", "Canada", "Australia"]])

In [31]:
avg_puzzle_solved2 = lvl2.groupby("Country")["Puzzles Solved"].mean()

In [32]:
avg_puzzle_solved2 = pd.Series(avg_puzzle_solved2.loc[["United States", "United Kingdom", "India", "Canada", "Australia"]])

In [33]:
geo2["Average_Age"] = avg_age2
geo2["Average_Time"] = avg_time2
geo2["Average_Puzzle_Solved"] = avg_puzzle_solved2

In [34]:
geo2.rename(columns = {"User ID":"Num_User"}, inplace = True)
geo2.reset_index()

Unnamed: 0,Country,Num_User,Average_Age,Average_Time,Average_Puzzle_Solved
0,United States,18288,49.24825,1047.311604,77.674486
1,United Kingdom,4195,56.329678,1046.702995,82.688915
2,India,4087,40.101541,1093.00619,34.78713
3,Canada,2316,53.130397,1055.801433,119.581174
4,Australia,921,54.024973,1067.483457,120.720955


In [35]:
## Level 3

In [33]:
lvl3 = sudoku[sudoku["Puzzle Level"]==3]

In [34]:
geo3 = lvl3.groupby("Country")["User ID"].count()

In [35]:
geo3 = pd.DataFrame(geo3.sort_values(ascending = False).head())

In [36]:
geo3.reset_index()

Unnamed: 0,Country,User ID
0,United States,11766
1,India,3591
2,United Kingdom,2785
3,Canada,1608
4,Australia,705


In [37]:
avg_age3 = lvl3.groupby("Country")["Age"].mean()

In [41]:
avg_age3 = pd.Series(avg_age3.loc[["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [42]:
avg_time3 = lvl3.groupby("Country")["Average_time"].mean()

In [43]:
avg_time3 = pd.Series(avg_time3.loc[["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [44]:
avg_puzzle_solved3 = lvl3.groupby("Country")["Puzzles Solved"].mean()

In [45]:
avg_puzzle_solved3 = pd.Series(avg_puzzle_solved3.loc\
                               [["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [46]:
geo3["Average_Age"] = avg_age3
geo3["Average_Time"] = avg_time3
geo3["Average_Puzzle_Solved"] = avg_puzzle_solved3

In [47]:
geo3.rename(columns = {"User ID":"Num_User"}, inplace = True)
geo3.reset_index()

Unnamed: 0,Country,Num_User,Average_Age,Average_Time,Average_Puzzle_Solved
0,United States,11766,51.002635,1248.259823,120.91331
1,India,3591,43.231412,1367.568223,62.602339
2,United Kingdom,2785,58.185278,1248.52081,111.584201
3,Canada,1608,53.710199,1202.987657,128.102612
4,Australia,705,54.885106,1284.912043,106.520567


In [48]:
## Level 4

In [38]:
lvl4 = sudoku[sudoku["Puzzle Level"]==4]

In [39]:
geo4 = lvl4.groupby("Country")["User ID"].count()

In [40]:
geo4 = pd.DataFrame(geo4.sort_values(ascending = False).head())

In [41]:
geo4.reset_index()

Unnamed: 0,Country,User ID
0,United States,9279
1,India,2523
2,United Kingdom,1951
3,Canada,1142
4,Australia,518


In [42]:
avg_age4 = lvl4.groupby("Country")["Age"].mean()

In [54]:
avg_age4 = pd.Series(avg_age4.loc[["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [55]:
avg_time4 = lvl4.groupby("Country")["Average_time"].mean()

In [56]:
avg_time4 = pd.Series(avg_time4.loc[["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [57]:
avg_puzzle_solved4 = lvl4.groupby("Country")["Puzzles Solved"].mean()

In [58]:
avg_puzzle_solved4 = pd.Series(avg_puzzle_solved4.loc\
                               [["United States", "India", "United Kingdom", "Canada", "Australia"]])

In [59]:
geo4["Average_Age"] = avg_age4
geo4["Average_Time"] = avg_time4
geo4["Average_Puzzle_Solved"] = avg_puzzle_solved4

In [60]:
geo4.rename(columns = {"User ID":"Num_User"}, inplace = True)
geo4.reset_index()

Unnamed: 0,Country,Num_User,Average_Age,Average_Time,Average_Puzzle_Solved
0,United States,9279,48.578834,1387.535303,155.971118
1,India,2523,46.324217,1555.824674,169.327784
2,United Kingdom,1951,56.219887,1415.151887,150.276269
3,Canada,1142,51.685639,1370.606031,185.900175
4,Australia,518,54.324324,1404.137696,161.53861


## How many players completed less than 5 of the puzzles?

In [14]:
sudoku.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739


In [15]:
## First, add up the number of puzzles solved by player
num_puzzle = sudoku.groupby("User ID")["Puzzles Solved"].sum()

In [16]:
num_puzzle.head()

User ID
1       1
2      16
5     218
7       7
8    3325
Name: Puzzles Solved, dtype: int64

In [17]:
num_puzzle.shape

(113058,)

In [18]:
## Next, determine how many players solved <=5 puzzles
num_puzzle[num_puzzle<=5].shape

(52195,)

In [19]:
num_puzzle[num_puzzle<=5].head()

User ID
1     1
13    1
17    2
18    3
25    4
Name: Puzzles Solved, dtype: int64

In [20]:
less5_completion = {"Total players": num_puzzle.shape[0], \
                           "Player completing <=5 puzzles":num_puzzle[num_puzzle<=5].shape[0] }

In [21]:
str(round(num_puzzle[num_puzzle<=5].shape[0] / num_puzzle.shape[0] * 100,2)) + str("%")

'46.17%'

## How many players for each solved <5 puzzles?

In [69]:
## Recall we separated the players by level and gave it variables
### lvl1, lvl2, lvl3, lvl4

In [43]:
lvl1.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739
13,8,1,156,850,785,65,253451,115729295,1963,India,0,m,56,1.0,298.177647
22,13,1,234,1,0,0,234,54756,1974,United States,VA,m,45,1.0,234.0


In [45]:
lvl1_puzzle = lvl1.groupby("User ID")["Puzzles Solved"].sum()

In [46]:
lvl1_puzzle.shape

(84772,)

In [47]:
lvl1_puzzle[lvl1_puzzle <= 5].shape

(49896,)

In [48]:
print("The % of players who completed <=5 puzzles for lvl1 is {}"\
      .format(str(round(lvl1_puzzle[lvl1_puzzle <= 5].shape[0]/lvl1_puzzle.shape[0]*100,2)) + str("%")))

The % of players who completed <=5 puzzles for lvl1 is 58.86%


In [49]:
lvl2_puzzle = lvl2.groupby("User ID")["Puzzles Solved"].sum()

In [50]:
print("The % of players who completed <=5 puzzles for lvl2 is {}"\
      .format(str(round(lvl2_puzzle[lvl2_puzzle <= 5].shape[0]/lvl2_puzzle.shape[0]*100,2)) + str("%")))

The % of players who completed <=5 puzzles for lvl2 is 53.07%


In [52]:
lvl3_puzzle = lvl3.groupby("User ID")["Puzzles Solved"].sum()

In [53]:
print("The % of players who completed <=5 puzzles for lvl3 is {}"\
      .format(str(round(lvl3_puzzle[lvl3_puzzle <= 5].shape[0]/lvl3_puzzle.shape[0]*100,2)) + str("%")))

The % of players who completed <=5 puzzles for lvl3 is 47.84%


In [54]:
lvl4_puzzle = lvl4.groupby("User ID")["Puzzles Solved"].sum()

In [55]:
print("The % of players who completed <=5 puzzles for lvl4 is {}"\
      .format(str(round(lvl4_puzzle[lvl4_puzzle <= 5].shape[0]/lvl4_puzzle.shape[0]*100,2)) + str("%")))

The % of players who completed <=5 puzzles for lvl4 is 48.43%


## How many users are users who attempted multiple levels?

In [81]:
sudoku.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739


In [82]:
users = sudoku.groupby("User ID")["User ID"].count()

In [83]:
users.head()

User ID
1    1
2    3
5    4
7    1
8    4
Name: User ID, dtype: int64

In [84]:
users.shape

(113058,)

In [85]:
recurring_users = users[users>1]

In [86]:
recurring_users.shape

(43473,)

In [87]:
print("The % of players who are recurring users is {}"\
      .format(str(round(recurring_users.shape[0]/users.shape[0]*100,2)) + str("%")))

The % of players who are recurring users is 38.45%


In [88]:
## How many including those who didn't populate all info?
raw_data = pd.read_csv("sudoku_counts.csv")

In [89]:
raw_data.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f
4,3,1,190,1,1,0,190,36100,0,,TX,0


In [90]:
all_users = raw_data.groupby("User ID")["User ID"].count()

In [91]:
all_recurring_users = all_users[all_users>1]

In [92]:
print("The % of players who are recurring users is {}"\
      .format(str(round(all_recurring_users.shape[0]/all_users.shape[0]*100,2)) + str("%")))

The % of players who are recurring users is 37.03%


## 1. Who is the most valuable customer we want to solicit and retain

In [93]:
## Determine avg total play time by country. 
### To make this more indicative, we should remove the entries where user solved <5 puzzles.
sudoku.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739


In [94]:
puzzles_over5 = pd.DataFrame(sudoku.groupby(["User ID", "Country", "Gender", "Age"])\
                             [["Puzzles Solved","Total Time (s) for all Puzzles Solved"]].sum().reset_index())

In [95]:
puzzles_over5.head()

Unnamed: 0,User ID,Country,Gender,Age,Puzzles Solved,Total Time (s) for all Puzzles Solved
0,1,Israel,m,43,1,506
1,2,Israel,f,69,16,3994
2,5,United States,m,70,218,215525
3,7,United States,f,61,7,7781
4,8,India,m,56,3325,1914368


In [96]:
puzzles_over5 = puzzles_over5[puzzles_over5["Puzzles Solved"] > 5]

In [97]:
puzzles_over5.head()

Unnamed: 0,User ID,Country,Gender,Age,Puzzles Solved,Total Time (s) for all Puzzles Solved
1,2,Israel,f,69,16,3994
2,5,United States,m,70,218,215525
3,7,United States,f,61,7,7781
4,8,India,m,56,3325,1914368
5,11,United States,m,76,977,1790980


In [98]:
avg_time_by_country = puzzles_over5.groupby("Country").sum()

In [99]:
avg_time_by_country.head()

Unnamed: 0_level_0,User ID,Age,Puzzles Solved,Total Time (s) for all Puzzles Solved
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,3332770,980,1883,2348608
Albania,8782780,3434,9041,7632611
Algeria,3206189,1122,1149,1062601
American Samoa,3185005,786,1388,1480520
Andorra,853764,280,745,599408


In [100]:
avg_time_by_country["Average_time_per_puzzle"] = avg_time_by_country["Total Time (s) for all Puzzles Solved"]/\
                                            avg_time_by_country["Puzzles Solved"]

In [101]:
avg_time_by_country.sort_values("Average_time_per_puzzle", ascending = False).head(10)
## These countries hae the highest average time per puzzle. Therefore, want to decrease the level of difficulty.

Unnamed: 0_level_0,User ID,Age,Puzzles Solved,Total Time (s) for all Puzzles Solved,Average_time_per_puzzle
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lesotho,66605,42,7,21962,3137.428571
Antigua and Barbuda,505294,301,1799,3815106,2120.68149
Eritrea,67110,39,6,12657,2109.5
Madagascar,195982,72,69,137802,1997.130435
Liberia,37904,23,9,16390,1821.111111
Rwanda,273978,72,72,129625,1800.347222
Holy See (Vatican City State),438138,80,34,52252,1536.823529
Cuba,787566,436,377,553105,1467.122016
Cape Verde,468753,198,192,268858,1400.302083
Burkina Faso,334128,96,145,196633,1356.089655


In [102]:
avg_time_by_country.sort_values(["Puzzles Solved","Average_time_per_puzzle"], ascending = [False,True]).head(10)

Unnamed: 0_level_0,User ID,Age,Puzzles Solved,Total Time (s) for all Puzzles Solved,Average_time_per_puzzle
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States,2966794169,1319076,6482603,4059477593,626.211044
United Kingdom,674276234,310738,1369158,871020231,636.172181
India,636669021,232281,957370,763570602,797.571056
Canada,355886128,170613,914624,584260322,638.798372
Australia,187375523,78828,417609,244920593,586.483033
Malaysia,104261119,43390,238436,121735486,510.55833
Netherlands,41223128,21562,197302,109665439,555.825278
Romania,63715133,28442,160420,114489130,713.683643
Egypt,78041196,34852,140143,109355678,780.31495
Philippines,69353185,28755,138926,95103125,684.559586


In [103]:
avg_time_by_country.sort_values("Average_time_per_puzzle", ascending = True).head(10)

Unnamed: 0_level_0,User ID,Age,Puzzles Solved,Total Time (s) for all Puzzles Solved,Average_time_per_puzzle
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Korea, Democratic People's Republic of",109906,63,90,15405,171.166667
"Tanzania, United Republic of",773623,397,11701,2461402,210.35826
Falkland Islands (Malvinas),324360,131,1151,256781,223.093831
Lithuania,12519501,4643,111353,35363932,317.584008
Mozambique,377593,165,1260,416060,330.206349
Niger,230244,17,6,2094,349.0
French Polynesia,103606,39,361,132908,368.166205
Gambia,165950,19,28,11326,404.5
Montserrat,421304,87,725,297739,410.674483
Saint Barth√©lemy,244937,158,224,98049,437.71875


## Regression Analysis

In [104]:
sudoku.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739


In [105]:
sudoku.shape

(183212, 15)

In [106]:
gender_dummies = pd.get_dummies(sudoku.Gender)

In [107]:
sudoku_total_time = pd.DataFrame(sudoku["Total Time (s) for all Puzzles Solved"])
sudoku_total_time = sudoku_total_time.join(sudoku.Age).join(gender_dummies)
sudoku_total_time.head()

Unnamed: 0,Total Time (s) for all Puzzles Solved,Age,f,m
0,506,43,0,1
1,2307,69,1,0
2,131,69,1,0
3,1556,69,1,0
7,72691,70,0,1


In [108]:
Y = sudoku["Total Time (s) for all Puzzles Solved"]
X = sudoku_total_time.drop("Total Time (s) for all Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 
regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([ 19047.17410042,  12510.77619298,  19047.17410042, ...,
        50410.1252578 , 108808.79030658,  23036.57926514])

In [109]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [110]:
model.summary()

0,1,2,3
Dep. Variable:,Total Time (s) for all Puzzles Solved,R-squared:,0.018
Model:,OLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,1327.0
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,0.0
Time:,15:08:40,Log-Likelihood:,-2051700.0
No. Observations:,146569,AIC:,4103000.0
Df Residuals:,146566,BIC:,4103000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.804e+04,1348.193,-20.802,0.000,-3.07e+04,-2.54e+04
Age,1994.7026,40.494,49.259,0.000,1915.335,2074.070
f,-2.128e+04,1036.124,-20.538,0.000,-2.33e+04,-1.92e+04
m,-6764.8625,1043.880,-6.480,0.000,-8810.847,-4718.878

0,1,2,3
Omnibus:,351286.152,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9083005529.784
Skew:,24.408,Prob(JB):,0.0
Kurtosis:,1221.572,Cond. No.,2.31e+17


In [152]:
## Does puzzles solved have relationship with age and gender
sudoku_puzzle_solved = pd.DataFrame(sudoku["Puzzles Solved"])
sudoku_puzzle_solved = sudoku_puzzle_solved.join(sudoku.Age).join(gender_dummies)
sudoku_puzzle_solved.head()

Unnamed: 0,Puzzles Solved,Age,f,m
0,1,43,0,1
1,12,69,1,0
2,2,69,1,0
3,2,69,1,0
7,119,70,0,1


In [153]:
Y = sudoku_puzzle_solved["Puzzles Solved"]
X = sudoku_puzzle_solved.drop("Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 

regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([ 32.25464674,  28.00410743,  32.25464674, ...,  80.9291185 ,
       157.60335716,  37.82570054])

In [154]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [155]:
model.summary()

0,1,2,3
Dep. Variable:,Puzzles Solved,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,726.4
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,1.27e-314
Time:,15:38:22,Log-Likelihood:,-1130600.0
No. Observations:,146569,AIC:,2261000.0
Df Residuals:,146566,BIC:,2261000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-33.7673,2.515,-13.426,0.000,-38.697,-28.838
Age,2.7855,0.076,36.873,0.000,2.637,2.934
f,-24.5800,1.933,-12.716,0.000,-28.369,-20.791
m,-9.1873,1.947,-4.718,0.000,-13.004,-5.370

0,1,2,3
Omnibus:,433299.287,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,93661090560.363
Skew:,41.715,Prob(JB):,0.0
Kurtosis:,3918.305,Cond. No.,2.31e+17


In [115]:
## Is average completion time a significant factor?
sudoku_puzzle_solved = pd.DataFrame(sudoku["Puzzles Solved"])
sudoku_puzzle_solved = sudoku_puzzle_solved.join(sudoku.Age).join(gender_dummies).join(sudoku.Average_time)
sudoku_puzzle_solved.head()

Unnamed: 0,Puzzles Solved,Age,f,m,Average_time
0,1,43,0,1,506.0
1,12,69,1,0,192.25
2,2,69,1,0,65.5
3,2,69,1,0,778.0
7,119,70,0,1,610.848739


In [116]:
Y = sudoku["Puzzles Solved"]
X = sudoku_puzzle_solved.drop("Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 

regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([ 54.44752215,  46.11022997,  31.21352521, ...,  91.08867912,
       126.64161596, -29.74151326])

In [117]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [118]:
model.summary()

0,1,2,3
Dep. Variable:,Puzzles Solved,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,778.3
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,0.0
Time:,15:08:40,Log-Likelihood:,-1130200.0
No. Observations:,146569,AIC:,2260000.0
Df Residuals:,146565,BIC:,2260000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.6001,2.707,-1.330,0.184,-8.907,1.706
Age,2.9462,0.076,39.014,0.000,2.798,3.094
f,-11.4161,1.978,-5.771,0.000,-15.293,-7.539
m,7.8160,2.025,3.860,0.000,3.847,11.785
Average_time,-0.0525,0.002,-29.556,0.000,-0.056,-0.049

0,1,2,3
Omnibus:,434056.571,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,95376235237.784
Skew:,41.914,Prob(JB):,0.0
Kurtosis:,3953.999,Cond. No.,6.01e+18


## I want to see if the diff between avg and best time contributes to puzzles solved

In [119]:
sudoku.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739


In [120]:
sudoku["Diff_avg_best"] = sudoku["Average_time"] - sudoku["Best Time (s)"]

In [121]:
sudoku.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time,Diff_avg_best
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0,0.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25,113.25
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5,0.5
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0,484.0
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739,353.848739


In [138]:
sudoku_puzzle_solved2 = pd.DataFrame(sudoku["Puzzles Solved"])
sudoku_puzzle_solved2 = sudoku_puzzle_solved2.join(sudoku.Age).join(sudoku.Average_time).\
                            join(sudoku.Diff_avg_best).join(sudoku["Best Time (s)"])
sudoku_puzzle_solved2.head()

Unnamed: 0,Puzzles Solved,Age,Average_time,Diff_avg_best,Best Time (s)
0,1,43,506.0,0.0,506
1,12,69,192.25,113.25,79
2,2,69,65.5,0.5,65
3,2,69,778.0,484.0,294
7,119,70,610.848739,353.848739,257


In [139]:
Y = sudoku["Puzzles Solved"]
X = sudoku_puzzle_solved2.drop("Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 

regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([ 53.80197174,  77.60873444,  57.6867984 , ...,  78.36233551,
       212.25686676,  70.11312897])

In [140]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [141]:
model.summary()

0,1,2,3
Dep. Variable:,Puzzles Solved,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,949.0
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,0.0
Time:,15:34:44,Log-Likelihood:,-1129900.0
No. Observations:,146569,AIC:,2260000.0
Df Residuals:,146565,BIC:,2260000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.4676,4.055,-0.855,0.392,-11.414,4.479
Age,2.7955,0.075,37.071,0.000,2.648,2.943
Average_time,-0.0153,0.001,-10.591,0.000,-0.018,-0.012
Diff_avg_best,0.0431,0.003,16.254,0.000,0.038,0.048
Best Time (s),-0.0583,0.002,-31.525,0.000,-0.062,-0.055

0,1,2,3
Omnibus:,434765.217,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96698762318.8
Skew:,42.103,Prob(JB):,0.0
Kurtosis:,3981.302,Cond. No.,9910000000000000.0


## Regression by Level

In [7]:
## Level 1
lvl1.head().reset_index()

Unnamed: 0,index,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender
0,0,1,1,506,1,1,0,506,256036,1976,Israel,0,m
1,1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f
2,4,3,1,190,1,1,0,190,36100,0,,TX,0
3,7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m
4,13,8,1,156,850,785,65,253451,115729295,1963,India,0,m


In [15]:
lvl1["Diff_avg_best"] = lvl1["Average_time"] - lvl1["Best Time (s)"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
lvl1.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time,Diff_avg_best
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m,43,1.0,506.0,0.0
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f,69,0.0,192.25,113.25
7,5,1,257,119,35,21,72691,53643653,1949,United States,FL,m,70,1.0,610.848739,353.848739
13,8,1,156,850,785,65,253451,115729295,1963,India,0,m,56,1.0,298.177647,142.177647
22,13,1,234,1,0,0,234,54756,1974,United States,VA,m,45,1.0,234.0,0.0


In [47]:
lvl1_puzzle_solved = pd.DataFrame(lvl1["Puzzles Solved"])
lvl1_puzzle_solved = lvl1_puzzle_solved.join(lvl1.Age).join(lvl1.Average_time).join(lvl1["Best Time (s)"])
lvl1_puzzle_solved.head()

Unnamed: 0,Puzzles Solved,Age,Average_time,Best Time (s)
0,1,43,506.0,506
1,12,69,192.25,79
7,119,70,610.848739,257
13,850,56,298.177647,156
22,1,45,234.0,234


In [48]:
Y = lvl1_puzzle_solved["Puzzles Solved"]
X = lvl1_puzzle_solved.drop("Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 

regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([24.2119004 , 21.23629367, 72.923564  , ..., 84.20921488,
       15.43631678, 50.72385061])

In [49]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [50]:
model.summary()

0,1,2,3
Dep. Variable:,Puzzles Solved,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.014
Method:,Least Squares,F-statistic:,322.0
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,1.33e-207
Time:,16:30:18,Log-Likelihood:,-504490.0
No. Observations:,67817,AIC:,1009000.0
Df Residuals:,67813,BIC:,1009000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.9721,4.429,1.349,0.177,-2.708,14.652
Age,1.8627,0.083,22.388,0.000,1.700,2.026
Average_time,-0.0094,0.005,-1.775,0.076,-0.020,0.001
Best Time (s),-0.0551,0.006,-9.093,0.000,-0.067,-0.043

0,1,2,3
Omnibus:,205741.923,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30410177904.524
Skew:,45.137,Prob(JB):,0.0
Kurtosis:,3282.301,Cond. No.,3520.0


In [27]:
## Level 2
lvl2["Diff_avg_best"] = lvl2["Average_time"] - lvl2["Best Time (s)"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [28]:
lvl2.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time,Diff_avg_best
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f,69,0.0,65.5,0.5
8,5,2,616,97,2,2,137276,218080970,1949,United States,FL,m,70,1.0,1415.216495,799.216495
12,7,2,409,7,2,3,7781,10452679,1958,United States,WA,f,61,0.0,1111.571429,702.571429
14,8,2,237,825,634,191,426972,289110586,1963,India,0,m,56,1.0,517.541818,280.541818
24,14,2,518,1,1,0,518,268324,1965,Egypt,0,m,54,1.0,518.0,0.0


In [51]:
lvl2_puzzle_solved = pd.DataFrame(lvl2["Puzzles Solved"])
lvl2_puzzle_solved = lvl2_puzzle_solved.join(lvl2.Age).join(lvl2.Average_time).join(lvl2["Best Time (s)"])
lvl2_puzzle_solved.head()

Unnamed: 0,Puzzles Solved,Age,Average_time,Best Time (s)
2,2,69,65.5,65
8,97,70,1415.216495,616
12,7,61,1111.571429,409
14,825,56,517.541818,237
24,1,54,518.0,518


In [52]:
Y = lvl2_puzzle_solved["Puzzles Solved"]
X = lvl2_puzzle_solved.drop("Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 

regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([123.80700347, 136.73831846,  96.01800377, ..., 129.8904693 ,
        29.71472487, 161.18797491])

In [53]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [54]:
model.summary()

0,1,2,3
Dep. Variable:,Puzzles Solved,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,191.2
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,6.07e-123
Time:,16:31:03,Log-Likelihood:,-262770.0
No. Observations:,33832,AIC:,525600.0
Df Residuals:,33828,BIC:,525600.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.3999,9.287,1.120,0.263,-7.804,28.604
Age,2.6714,0.166,16.118,0.000,2.347,2.996
Average_time,-0.0036,0.009,-0.401,0.688,-0.021,0.014
Best Time (s),-0.0778,0.010,-7.784,0.000,-0.097,-0.058

0,1,2,3
Omnibus:,103559.522,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16044274590.301
Skew:,46.203,Prob(JB):,0.0
Kurtosis:,3375.398,Cond. No.,4820.0


In [34]:
## Level 3
lvl3["Diff_avg_best"] = lvl3["Average_time"] - lvl3["Best Time (s)"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
lvl3.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time,Diff_avg_best
9,5,3,2198,1,1,0,2198,4831204,1949,United States,FL,m,70,1.0,2198.0,0.0
15,8,3,272,825,335,490,575118,523259446,1963,India,0,m,56,1.0,697.112727,425.112727
17,11,3,1089,1,1,0,1089,1185921,1943,United States,TX,m,76,1.0,1089.0,0.0
25,14,3,747,1,0,1,747,558009,1965,Egypt,0,m,54,1.0,747.0,0.0
30,16,3,521,8,2,4,7324,8152340,1989,Australia,0,m,30,1.0,915.5,394.5


In [55]:
lvl3_puzzle_solved = pd.DataFrame(lvl3["Puzzles Solved"])
lvl3_puzzle_solved = lvl3_puzzle_solved.join(lvl3.Age).join(lvl3.Average_time).join(lvl3["Best Time (s)"])
lvl3_puzzle_solved.head()

Unnamed: 0,Puzzles Solved,Age,Average_time,Best Time (s)
9,1,70,2198.0,2198
15,825,56,697.112727,272
17,1,76,1089.0,1089
25,1,54,747.0,747
30,8,30,915.5,521


In [56]:
Y = lvl3_puzzle_solved["Puzzles Solved"]
X = lvl3_puzzle_solved.drop("Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 

regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([163.65729516, 222.53419763, 155.59290172, ...,  91.12609207,
       124.73411226, 200.17870517])

In [57]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [58]:
model.summary()

0,1,2,3
Dep. Variable:,Puzzles Solved,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,161.0
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,2.12e-103
Time:,16:31:50,Log-Likelihood:,-200060.0
No. Observations:,25044,AIC:,400100.0
Df Residuals:,25040,BIC:,400200.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,29.7424,14.397,2.066,0.039,1.524,57.961
Age,3.4169,0.253,13.525,0.000,2.922,3.912
Average_time,-0.0128,0.011,-1.146,0.252,-0.035,0.009
Best Time (s),-0.0811,0.012,-6.720,0.000,-0.105,-0.057

0,1,2,3
Omnibus:,83731.14,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36400477178.288
Skew:,59.083,Prob(JB):,0.0
Kurtosis:,5908.006,Cond. No.,6160.0


In [41]:
## Level 4
lvl4["Diff_avg_best"] = lvl4["Average_time"] - lvl4["Best Time (s)"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [42]:
lvl4.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender,Age,Gender_numeric,Average_time,Diff_avg_best
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f,69,0.0,778.0,484.0
10,5,4,3360,1,0,0,3360,11289600,1949,United States,FL,m,70,1.0,3360.0,0.0
16,8,4,339,825,125,700,658827,624195239,1963,India,0,m,56,1.0,798.578182,459.578182
18,11,4,836,976,972,4,1789891,3533206483,1943,United States,TX,m,76,1.0,1833.904713,997.904713
26,14,4,430,168,23,138,124293,95880567,1965,Egypt,0,m,54,1.0,739.839286,309.839286


In [59]:
lvl4_puzzle_solved = pd.DataFrame(lvl4["Puzzles Solved"])
lvl4_puzzle_solved = lvl4_puzzle_solved.join(lvl4.Age).join(lvl4.Average_time).join(lvl4["Best Time (s)"])
lvl4_puzzle_solved.head()

Unnamed: 0,Puzzles Solved,Age,Average_time,Best Time (s)
3,2,69,778.0,294
10,1,70,3360.0,3360
16,825,56,798.578182,339
18,976,76,1833.904713,836
26,168,54,739.839286,430


In [60]:
Y = lvl4_puzzle_solved["Puzzles Solved"]
X = lvl4_puzzle_solved.drop("Puzzles Solved", axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 40)
regr = linear_model.LinearRegression() 

regr.fit(X_train, Y_train)
predicted = regr.predict(X_test)
predicted

array([346.8977693 , 183.47133989, 353.61933427, ..., 101.95876821,
       279.14989531, 305.25103333])

In [61]:
X1 = sm.add_constant(X_train)
model = sm.OLS(Y_train, X1).fit() 

  return ptp(axis=axis, out=out, **kwargs)


In [62]:
model.summary()

0,1,2,3
Dep. Variable:,Puzzles Solved,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.036
Method:,Least Squares,F-statistic:,247.3
Date:,"Sat, 09 Nov 2019",Prob (F-statistic):,1.46e-157
Time:,16:32:29,Log-Likelihood:,-159400.0
No. Observations:,19875,AIC:,318800.0
Df Residuals:,19871,BIC:,318800.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,51.9706,16.326,3.183,0.001,19.970,83.971
Age,4.6541,0.293,15.898,0.000,4.080,5.228
Average_time,0.0064,0.011,0.585,0.559,-0.015,0.028
Best Time (s),-0.1262,0.012,-10.666,0.000,-0.149,-0.103

0,1,2,3
Omnibus:,36193.549,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,67207913.295
Skew:,13.398,Prob(JB):,0.0
Kurtosis:,286.617,Cond. No.,6770.0


## How many people did not solve any puzzle?

In [87]:
puzzle = pd.read_csv("sudoku_counts.csv")

In [88]:
puzzle.head()

Unnamed: 0,User ID,Puzzle Level,Best Time (s),Puzzles Solved,Puzzle Solved w/o Errors,Puzzle Solved w/ Errors,Total Time (s) for all Puzzles Solved,Total of (time squared) for all puzzles solved,Birth_Year,Country,US_State,Gender
0,1,1,506,1,1,0,506,256036,1976,Israel,0,m
1,2,1,79,12,11,1,2307,511683,1950,Israel,0,f
2,2,2,65,2,1,1,131,8581,1950,Israel,0,f
3,2,4,294,2,2,0,1556,1679080,1950,Israel,0,f
4,3,1,190,1,1,0,190,36100,0,,TX,0


In [89]:
no_completion = puzzle.groupby("User ID")["Puzzles Solved"].sum()

In [90]:
no_completion.shape

(196480,)

In [80]:
no_completion[no_completion == 0].shape

(379,)

In [91]:
print("The % of players who did not complete any puzzle is {}"\
      .format(str(round(no_completion[no_completion == 0].shape[0]/no_completion.shape[0]*100,2)) + str("%")))

The % of players who did not complete any puzzle is 0.19%


In [92]:
puzzle1 = puzzle[puzzle["Puzzle Level"] ==1]

In [93]:
no_completion1 = puzzle1.groupby("User ID")["Puzzles Solved"].sum()

In [94]:
print("The % of players who did not complete any puzzle for lvl1 is {}"\
      .format(str(round(no_completion1[no_completion1 == 0].shape[0]/no_completion1.shape[0]*100,2)) + str("%")))

The % of players who did not complete any puzzle for lvl1 is 0.32%


In [95]:
puzzle2 = puzzle[puzzle["Puzzle Level"] ==2]

In [96]:
no_completion2 = puzzle2.groupby("User ID")["Puzzles Solved"].sum()

In [97]:
print("The % of players who did not complete any puzzle for lvl2 is {}"\
      .format(str(round(no_completion2[no_completion2 == 0].shape[0]/no_completion2.shape[0]*100,2)) + str("%")))

The % of players who did not complete any puzzle for lvl2 is 0.21%


In [98]:
puzzle3 = puzzle[puzzle["Puzzle Level"] ==3]

In [99]:
no_completion3 = puzzle3.groupby("User ID")["Puzzles Solved"].sum()

In [100]:
print("The % of players who did not complete any puzzle for lvl3 is {}"\
      .format(str(round(no_completion3[no_completion3 == 0].shape[0]/no_completion3.shape[0]*100,2)) + str("%")))

The % of players who did not complete any puzzle for lvl3 is 0.18%


In [101]:
puzzle4 = puzzle[puzzle["Puzzle Level"] ==4]

In [102]:
no_completion4 = puzzle4.groupby("User ID")["Puzzles Solved"].sum()

In [103]:
print("The % of players who did not complete any puzzle for lvl4 is {}"\
      .format(str(round(no_completion4[no_completion4 == 0].shape[0]/no_completion4.shape[0]*100,2)) + str("%")))

The % of players who did not complete any puzzle for lvl4 is 0.16%
