In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the sample data from ranges.csv into a DataFrame
df = pd.read_csv('ranges.csv')

In [3]:
# Range and Measurement are clearly related, but we can't calculate their correlation yet,
# because Range is text and correlation works only with numeric values
df

Unnamed: 0,Measurement,Region,Range
0,881792,Europe,High
1,867134,America,High
2,241654,Asia,Low
3,456234,Europe,Middle
4,301892,America,Low
5,654132,America,High
6,540275,Asia,Middle
7,467122,Asia,Middle
8,68512,Europe,Low


In [4]:
# Encode the text labels in Range as integer codes with factorize()
values1, names1 = df['Range'].factorize(sort=False)

# Store the integer codes in a new DataFrame column
df['RangeValue'] = values1

In [5]:
# NOTE: factorize() doesn't know the real order of the values.
# It assigns the next available number in order of first appearance in the data.
# In this case it sees High first => assigns 0,
# then Low => assigns 1,
# and finally Middle => assigns 2
df

Unnamed: 0,Measurement,Region,Range,RangeValue
0,881792,Europe,High,0
1,867134,America,High,0
2,241654,Asia,Low,1
3,456234,Europe,Middle,2
4,301892,America,Low,1
5,654132,America,High,0
6,540275,Asia,Middle,2
7,467122,Asia,Middle,2
8,68512,Europe,Low,1


In [6]:
# The correlation is misleading: it comes out negative because the codes
# (High=0, Low=1, Middle=2) don't follow the real order of the categories
df.corr(numeric_only=True)

Unnamed: 0,Measurement,RangeValue
Measurement,1.0,-0.494689
RangeValue,-0.494689,1.0


<b>Version 2: Use factorize with sorting - Ordinal category (ranked category)</b>

In [7]:
# Reload the data and order the rows by Measurement first, so that factorize()
# encounters the labels in ascending order (Low, Middle, High).
# This only helps when the data can easily be sorted into the right order.
df = pd.read_csv('ranges.csv').sort_values('Measurement')

# Encode the text labels in Range as integer codes with factorize()
values1, names1 = df['Range'].factorize(sort=False)

# Store the integer codes in a new DataFrame column
df['RangeValue'] = values1

In [8]:
# Rows are now ordered by Measurement, so the codes come out as Low=0, Middle=1, High=2
df

Unnamed: 0,Measurement,Region,Range,RangeValue
8,68512,Europe,Low,0
2,241654,Asia,Low,0
4,301892,America,Low,0
3,456234,Europe,Middle,1
7,467122,Asia,Middle,1
6,540275,Asia,Middle,1
5,654132,America,High,2
1,867134,America,High,2
0,881792,Europe,High,2


In [9]:
# Now the correlation is as expected: strongly positive
df.corr(numeric_only=True)

Unnamed: 0,Measurement,RangeValue
Measurement,1.0,0.943116
RangeValue,0.943116,1.0


<b>Version 3: Use dictionary and replace() - Ordinal category (ranked category)<br>
THIS IS THE RECOMMENDED APPROACH WITH ORDINAL/RANKED CATEGORIES</b>

In [10]:
# Usually the best way to encode ordinal categories is an explicit Python
# dictionary that states the rank of every label.
# Series.map() does the dictionary lookup here instead of replace(): replacing
# text with numbers relies on an implicit dtype downcast that pandas 2.2+
# deprecates (FutureWarning); without that downcast the column would likely
# stay non-numeric and be left out of corr(numeric_only=True).
df = pd.read_csv('ranges.csv')
category_mapper = {'High': 2, 'Middle': 1, 'Low': 0}
df['RangeValue'] = df['Range'].map(category_mapper)

# map() yields NaN for any label missing from the dictionary, so fail loudly
# instead of silently computing a correlation on incomplete codes
assert df['RangeValue'].notna().all(), 'Range contains labels missing from category_mapper'

In [11]:
# Now the codes follow the intended order (Low=0, Middle=1, High=2) without sorting the data
df

Unnamed: 0,Measurement,Region,Range,RangeValue
0,881792,Europe,High,2
1,867134,America,High,2
2,241654,Asia,Low,0
3,456234,Europe,Middle,1
4,301892,America,Low,0
5,654132,America,High,2
6,540275,Asia,Middle,1
7,467122,Asia,Middle,1
8,68512,Europe,Low,0


In [12]:
# Check the correlation again: it is strongly positive, as expected for correctly ordered codes
df.corr(numeric_only=True)

Unnamed: 0,Measurement,RangeValue
Measurement,1.0,0.943116
RangeValue,0.943116,1.0
