# Case study: Passwords

## Import data

In [3]:
import pandas as pd

# Base URL of the data folder and the name of the CSV file to load from it.
ROOT = "https://raw.githubusercontent.com/kirenz/modern-statistics/main/data/"
DATA = "passwords.csv"

# pandas reads the CSV directly from the URL, so this cell needs network access.
df = pd.read_csv(f"{ROOT}{DATA}")

## Data inspection

In [4]:
# Display the DataFrame; pandas abbreviates long frames to the first and last rows.
df

Unnamed: 0,rank,password,category,value,time_unit,offline_crack_sec,rank_alt,strength,font_size
0,1,password,password-related,6.91,years,2.170000e+00,1,8,11
1,2,123456,simple-alphanumeric,18.52,minutes,1.110000e-05,2,4,8
2,3,12345678,simple-alphanumeric,1.29,days,1.110000e-03,3,4,8
3,4,1234,simple-alphanumeric,11.11,seconds,1.110000e-07,4,4,8
4,5,qwerty,simple-alphanumeric,3.72,days,3.210000e-03,5,8,11
...,...,...,...,...,...,...,...,...,...
495,496,reddog,cool-macho,3.72,days,3.210000e-03,498,6,10
496,497,alexande,name,6.91,years,2.170000e+00,499,9,12
497,498,college,nerdy-pop,3.19,months,8.350000e-02,500,7,11
498,499,jester,name,3.72,days,3.210000e-03,501,7,11


In [4]:
# Overview of the columns: non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rank               500 non-null    int64  
 1   password           500 non-null    object 
 2   category           500 non-null    object 
 3   value              500 non-null    float64
 4   time_unit          500 non-null    object 
 5   offline_crack_sec  500 non-null    float64
 6   rank_alt           500 non-null    int64  
 7   strength           500 non-null    int64  
 8   font_size          500 non-null    int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 35.3+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,rank,value,offline_crack_sec,rank_alt,strength,font_size
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,5.60266,0.5000096,251.224,7.432,10.298
std,144.481833,8.436005,2.658132,145.052163,5.415536,3.651282
min,1.0,1.29,1.11e-07,1.0,0.0,0.0
25%,125.75,3.43,0.00321,125.75,6.0,10.0
50%,250.5,3.72,0.00321,251.5,7.0,11.0
75%,375.25,3.72,0.0835,376.25,8.0,11.0
max,500.0,92.27,29.27,502.0,48.0,28.0


## Data transformation

In [7]:
# These columns hold a limited set of levels, so store them as pandas
# categoricals instead of plain strings / integers.
for column in ("category", "time_unit", "strength"):
    df[column] = df[column].astype("category")

In [8]:
# Check the conversion: category, time_unit and strength should now be `category`.
df.dtypes

rank                    int64
password               object
category             category
value                 float64
time_unit            category
offline_crack_sec     float64
rank_alt                int64
strength             category
font_size               int64
dtype: object

In [13]:
# Summarize only the categorical columns: count, number of unique levels,
# most frequent level (top) and how often it occurs (freq).
df.describe(include="category")

Unnamed: 0,category,time_unit,strength
count,500,500,500
unique,10,7,22
top,name,days,8
freq,183,238,162


## Data exploration

In [5]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
import seaborn as sns

# Use seaborn's "ticks" theme for all following plots; color_codes=True remaps
# matplotlib's shorthand color codes (e.g. "b", "g", "r") to the seaborn palette.
sns.set_theme(style="ticks", color_codes=True)

In [1]:
# Exercise: plot the number of observations per category.
# Uncomment the call below and replace each ___ placeholder.

#sns.catplot(y="___", 
#            kind="count", 
#            data=___);

## More advanced plot

Next, we want to make a plot where we only show the top 10 categories. 

Furthermore, we want to show the categories as a bar chart, ordered by their number of observations from the most to the least frequent category.

In [16]:
# count how often each category occurs; the result is sorted from the most
# to the least frequent category
df['category'].value_counts()

name                   183
cool-macho              79
simple-alphanumeric     61
fluffy                  44
sport                   37
nerdy-pop               30
animal                  29
password-related        15
food                    11
rebellious-rude         11
Name: category, dtype: int64

In [17]:
# select the first 10 observations (rows 0 to 9, selected by position)
df['category'].iloc[:10]

0       password-related
1    simple-alphanumeric
2    simple-alphanumeric
3    simple-alphanumeric
4    simple-alphanumeric
5    simple-alphanumeric
6                 animal
7                  sport
8                  sport
9       password-related
Name: category, dtype: object

In [20]:
# select the index (row labels) of the first 10 observations
df['category'].iloc[:10].index

RangeIndex(start=0, stop=10, step=1)

In [21]:
# Exercise: make a top 10 list for the variable category.
# Uncomment the line below and fill in the three ___ placeholders;
# the plot in the next cell needs TOP_10.

#TOP_10 = df['category'].___.___.___

In [None]:
# Horizontal count plot with one bar per category, drawn in the order given by
# TOP_10. TOP_10 is expected to come from the exercise cell above; until that
# cell is completed and run, this cell raises a NameError.
sns.catplot(
    data=df,
    y="category",
    kind="count",
    order=TOP_10,
    palette="ch:.25",
);