# Ratio scale

In [None]:
# 1) Units are equally spaced
# 2) Mathematical operations of +, -, / and * are all valid
# E.g. height and weight

# Interval scale

In [None]:
# 1) Units are equally spaced, but there is no true zero
# 2) Operations of multiplication and division (* and /) are not valid
# E.g. Temperature in Celsius or Fahrenheit, the direction on a compass

# Ordinal scale

In [None]:
# The order of the units is important, but the units are not evenly spaced
# E.g. Letter grades such as A+, A

# Nominal scale

In [None]:
# Categories of data, but the categories have no order with respect to one another
# E.g. Teams of a sport

# Converting scales

In [5]:
# Bring in pandas for the DataFrame examples below
import pandas as pd

# Letter grades listed from best to worst, each labelled with a coarse quality band
letter_grades = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']
quality_bands = ['excellent'] * 3 + ['good'] * 3 + ['ok'] * 3 + ['poor'] * 2

# Single 'Grades' column, indexed by the quality band of each grade
df = pd.DataFrame({'Grades': letter_grades}, index=quality_bands)
df

Unnamed: 0,Grades
excellent,A+
excellent,A
excellent,A-
good,B+
good,B
good,B-
ok,C+
ok,C
ok,C-
poor,D+


In [6]:
# Inspect the column dtypes: the letter grades are stored as a generic object column
df.dtypes

Grades    object
dtype: object

In [7]:
# Cast the column to the categorical dtype with astype() and preview the first rows;
# the result is not assigned, so df itself is left unchanged
(
    df['Grades']
    .astype('category')
    .head(5)
)

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: Grades, dtype: category
Categories (11, object): [A, A+, A-, B, ..., C+, C-, D, D+]

In [10]:
# Declare an ordered categorical dtype: the categories are listed from the lowest to the
# highest grade, and ordered=True tells pandas that this sequence is meaningful
my_categories = pd.CategoricalDtype(
    ordered=True,
    categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
)

# Re-type the grades column with that dtype and preview the result
grades = df['Grades'].astype(my_categories)
grades.head()

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: Grades, dtype: category
Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]

In [11]:
# Asking the plain (non-categorical) column for grades greater than 'C' compares the
# values as strings, so the rows we get back are not the grades that are better than C
df.loc[df['Grades'].gt('C')]

Unnamed: 0,Grades
ok,C+
ok,C-
poor,D+
poor,D


In [12]:
# The same comparison on the ordered categorical Series follows the declared category
# order, so it returns the grades ranked above C
grades.loc[grades.gt('C')]

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
Name: Grades, dtype: category
Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]

In [None]:
# If we want to represent categorical values as separate columns, each holding a true or a false as to whether the
# category applies, we can use the get_dummies() function, which converts the values of a single column into multiple
# columns of zeros and ones indicating the presence of each dummy variable

In [None]:
# A common scale-based operation consists in converting a scale from the interval or ratio scale into a categorical one.

In [13]:
# We import libraries
import numpy as np

# We load the data
df=pd.read_csv('datasets/census.csv')

# We keep only the county-level rows (SUMLEV == 50)
df=df[df['SUMLEV']==50]

# We compute the average 2010 census population of the counties in each state.
# Grouping directly on STNAME and calling the built-in mean() replaces the
# set_index / groupby(level=0) / agg(np.average) chain; the result is still a
# Series indexed by STNAME. Note that mean() skips missing values.
df=df.groupby('STNAME')['CENSUS2010POP'].mean()

df.head()

STNAME
Alabama        71339.343284
Alaska         24490.724138
Arizona       426134.466667
Arkansas       38878.906667
California    642309.586207
Name: CENSUS2010POP, dtype: float64

In [14]:
# Bin the per-state averages into 10 intervals with cut(), which turns the numeric
# values into categorical ones
pd.cut(df, bins=10)

STNAME
Alabama                   (11706.087, 75333.413]
Alaska                    (11706.087, 75333.413]
Arizona                 (390320.176, 453317.529]
Arkansas                  (11706.087, 75333.413]
California              (579312.234, 642309.586]
Colorado                 (75333.413, 138330.766]
Connecticut             (390320.176, 453317.529]
Delaware                (264325.471, 327322.823]
District of Columbia    (579312.234, 642309.586]
Florida                 (264325.471, 327322.823]
Georgia                   (11706.087, 75333.413]
Hawaii                  (264325.471, 327322.823]
Idaho                     (11706.087, 75333.413]
Illinois                 (75333.413, 138330.766]
Indiana                   (11706.087, 75333.413]
Iowa                      (11706.087, 75333.413]
Kansas                    (11706.087, 75333.413]
Kentucky                  (11706.087, 75333.413]
Louisiana                 (11706.087, 75333.413]
Maine                    (75333.413, 138330.766]
Maryland     

In [None]:
# Note that the spacing between the categories is equally sized