<a href="https://colab.research.google.com/github/hthomas229/PurpleCrown/blob/main/sqlpyr_rank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rank in Python Pandas Package

***Parameters:***

**method:** Determines how to assign ranks to ties:

  **'average':** Default. Assigns the average rank to ties.

  **'min':** Assigns the minimum rank to ties.

  **'max':** Assigns the maximum rank to ties.

  **'first':** Assigns ranks in the order of their appearance.

  **'dense':** Like 'min', but the rank always increases by 1 from one group of ties to the next, so no rank numbers are skipped.

**ascending:** Ranks in ascending order (True, the default: the smallest value gets rank 1) or descending order (False: the largest value gets rank 1).

**na_option:** How NaN values are ranked: 'keep' leaves them as NaN, 'top' gives them the lowest ranks, and 'bottom' gives them the highest ranks.

Get Data

In [None]:
# prompt: upload csv from pc     acmecleaning1  VM> PBI -> AD
#
# Upload a CSV from the local machine through the Colab file picker and load
# it into the DataFrame `df` used by the rest of the notebook.

import pandas as pd
from google.colab import files

# Opens the browser upload dialog; the result is indexed by filename and each
# entry's length is reported in bytes below.
uploaded = files.upload()

# Without this guard an empty upload would skip the loop below and the
# read_csv call would fail with a confusing NameError on `fn`.
if not uploaded:
  raise ValueError('No file was uploaded; re-run this cell and choose a CSV file.')

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# `fn` still holds the last uploaded filename after the loop; that is the file
# loaded here. Upload a single CSV, or replace `fn` with an explicit filename.
df = pd.read_csv(fn)

# Preview the first rows to confirm the file parsed as expected.
df.head()

Saving acmecleaning 1.csv to acmecleaning 1 (1).csv
User uploaded file "acmecleaning 1 (1).csv" with length 2177 bytes


Unnamed: 0,EmployeeID,FirstName,LastName,Phone,Address,Email,Department,Salary
0,3001,James,Smith,482-910-2756,"123 Maple St, Springfield, IL 62701",JamesSmith@acmewidgets.com,IT,90000
1,3002,Mary,Johnson,(159)287-4630,"456 Oak Ave, Madison, WI 53703",MaryJohnson@acme.com,I.T.,90000
2,3003,Robert,Williams,9736125840,"789 Pine Rd, Austin, TX 73301",RobertWilliams@acmewidgets.com,Marketing,80000
3,3004,Patricia,Brown,608 359 1274,"101 Elm St, Denver, CO 80202",PatriciaBrown@acmewidgets.com,HR,100000
4,3005,John,Jones,245-170-9386,"202 Cedar Blvd, Miami, FL 33101",JohnJones@acmewidgets.com,Admin,110000


In [None]:
# Work on an independent copy of the three columns used for ranking, so the
# column assignments made to df_rank in later cells do not trigger pandas'
# SettingWithCopyWarning.
df_rank = df[['LastName','Department', 'Salary']].copy()


In [None]:
# Display the working frame that the ranking cells below add columns to.
df_rank

Unnamed: 0,LastName,Department,Salary
0,Smith,IT,90000
1,Johnson,I.T.,90000
2,Williams,Marketing,80000
3,Brown,HR,100000
4,Jones,Admin,110000
5,Garcia,Management,120000
6,Miller,IT,95000
7,Davis,Sales,75000
8,Rodriguez,MKTG,85000
9,Martinez,Marketing,80000


In [None]:
# method='average' (the pandas default): tied salaries share the mean of the
# rank positions they occupy, and those positions are then skipped
# (two salaries tied for 3rd and 4th both get 3.5; the next salary gets 5).
# SQL has no equivalent; rank() in R behaves this way.

df_rank['Salary_Rank'] = df['Salary'].rank(method='average', ascending=False)

# Show the best-paid rows first so the tie handling is easy to read. Only the
# displayed copy is sorted; df_rank keeps its row order.
df_rank.sort_values('Salary_Rank')

Unnamed: 0,LastName,Department,Salary,Salary_Rank
5,Garcia,Management,120000,1.0
18,Jackson,Management,115000,2.0
4,Jones,Admin,110000,3.5
11,Lopez,Admin,110000,3.5
3,Brown,HR,100000,5.0
14,Anderson,IT,95000,6.5
6,Miller,IT,95000,6.5
0,Smith,IT,90000,9.0
16,Taylor,I.T.,90000,9.0
1,Johnson,I.T.,90000,9.0


In [None]:
# method='first': every row gets its own consecutive rank with no ties; equal
# salaries are numbered in the order they appear in df.
# Comparable to ROW_NUMBER() in SQL and row_number() in R.

df_rank['Salary_Rank'] = df['Salary'].rank(method='first', ascending=False)

# Show the best-paid rows first so the sequential numbering is easy to read.
# Only the displayed copy is sorted; df_rank keeps its row order.
df_rank.sort_values('Salary_Rank')

Unnamed: 0,LastName,Department,Salary,Salary_Rank
5,Garcia,Management,120000,1.0
18,Jackson,Management,115000,2.0
4,Jones,Admin,110000,3.0
11,Lopez,Admin,110000,4.0
3,Brown,HR,100000,5.0
14,Anderson,IT,95000,7.0
6,Miller,IT,95000,6.0
0,Smith,IT,90000,8.0
16,Taylor,I.T.,90000,10.0
1,Johnson,I.T.,90000,9.0


In [None]:
# method='min': tied salaries all take the lowest rank position in their
# group, and the positions after it are skipped (1, 2, 3, 3, 5, ...).
# This is how RANK() behaves in SQL.

df_rank['Salary_Rank'] = df['Salary'].rank(method='min', ascending=False)

# Show the best-paid rows first so the skipped ranks are easy to see.
# Only the displayed copy is sorted; df_rank keeps its row order.
df_rank.sort_values('Salary_Rank')

Unnamed: 0,LastName,Department,Salary,Salary_Rank
5,Garcia,Management,120000,1.0
18,Jackson,Management,115000,2.0
4,Jones,Admin,110000,3.0
11,Lopez,Admin,110000,3.0
3,Brown,HR,100000,5.0
14,Anderson,IT,95000,6.0
6,Miller,IT,95000,6.0
0,Smith,IT,90000,8.0
16,Taylor,I.T.,90000,8.0
1,Johnson,I.T.,90000,8.0


In [None]:
# method='max': tied salaries all take the highest rank position in their
# group (two salaries tied for 3rd and 4th both get 4), so gaps appear before
# each group of ties.
# No SQL ranking function does this directly.
# NOTE(review): the original note also said there is no R equivalent, but base
# R's rank(ties.method = "max") appears to match -- confirm.

df_rank['Salary_Rank'] = df['Salary'].rank(method='max', ascending=False)

# Show the best-paid rows first so the tie handling is easy to read.
# Only the displayed copy is sorted; df_rank keeps its row order.
df_rank.sort_values('Salary_Rank')

Unnamed: 0,LastName,Department,Salary,Salary_Rank
5,Garcia,Management,120000,1.0
18,Jackson,Management,115000,2.0
4,Jones,Admin,110000,4.0
11,Lopez,Admin,110000,4.0
3,Brown,HR,100000,5.0
14,Anderson,IT,95000,7.0
6,Miller,IT,95000,7.0
0,Smith,IT,90000,10.0
16,Taylor,I.T.,90000,10.0
1,Johnson,I.T.,90000,10.0


In [None]:
# method='dense': tied salaries share a rank and the next distinct salary gets
# the very next integer, so no rank numbers are skipped (1, 2, 3, 3, 4, ...).
# Equivalent to DENSE_RANK() in SQL and dense_rank() in R.

df_rank['Salary_Rank'] = df['Salary'].rank(method='dense', ascending=False)

# Show the best-paid rows first so the gap-free numbering is easy to read.
# Only the displayed copy is sorted; df_rank keeps its row order.
df_rank.sort_values('Salary_Rank')

Unnamed: 0,LastName,Department,Salary,Salary_Rank
5,Garcia,Management,120000,1.0
18,Jackson,Management,115000,2.0
4,Jones,Admin,110000,3.0
11,Lopez,Admin,110000,3.0
3,Brown,HR,100000,4.0
14,Anderson,IT,95000,5.0
6,Miller,IT,95000,5.0
0,Smith,IT,90000,6.0
16,Taylor,I.T.,90000,6.0
1,Johnson,I.T.,90000,6.0


In [None]:
# pd.qcut is the closest pandas counterpart to NTILE in SQL / ntile in R: it
# tries to split the salaries into q equally populated bins. Unlike NTILE, the
# bins are cut on salary values, so tied salaries always land in the same bin
# and the bins can end up with different sizes.
# NOTE(review): the column is named 'Salary_Quartile' but q=7 requests seven
# bins, not four -- rename the column or use q=4 if quartiles are intended.

df_rank['Salary_Quartile'] = pd.qcut(
    df_rank['Salary'],
    q=7,               # number of quantile bins requested
    labels=False,      # return integer bin codes (starting at 0), not interval labels
    duplicates='drop'  # drop repeated bin edges instead of raising an error
) + 1                  # shift the codes so the lowest bin is numbered 1

# Show the best-paid rows first so the bin boundaries are easy to follow.
# Only the displayed copy is sorted; df_rank keeps its row order.
df_rank.sort_values('Salary', ascending=False)

Unnamed: 0,LastName,Department,Salary,Salary_Rank,Salary_Quartile
5,Garcia,Management,120000,1.0,7
18,Jackson,Management,115000,2.0,7
4,Jones,Admin,110000,3.0,6
11,Lopez,Admin,110000,3.0,6
3,Brown,HR,100000,4.0,6
14,Anderson,IT,95000,5.0,5
6,Miller,IT,95000,5.0,5
0,Smith,IT,90000,6.0,4
16,Taylor,I.T.,90000,6.0,4
1,Johnson,I.T.,90000,6.0,4


In [None]:
# Rank salaries inside each department: the dense ranking restarts at 1 for
# every Department group, highest salary first.
# NOTE(review): groupby treats every distinct spelling as its own group, and
# the output shows variants such as 'IT' / 'I.T.' and 'Marketing' / 'MKTG' --
# normalise Department first if those should be ranked together.
salary_by_dept = df.groupby('Department')['Salary']
df_rank['Salary_Rank'] = salary_by_dept.rank(method='dense', ascending=False)

# Order the frame by department, then by rank within each department.
sort_keys = ['Department', 'Salary_Rank']
df_rank = df_rank.sort_values(by=sort_keys)

df_rank

Unnamed: 0,LastName,Department,Salary,Salary_Rank,Salary_Quartile
4,Jones,Admin,110000,1.0,6
11,Lopez,Admin,110000,1.0,6
3,Brown,HR,100000,1.0,6
16,Taylor,I.T.,90000,1.0,4
1,Johnson,I.T.,90000,1.0,4
14,Anderson,IT,95000,1.0,5
6,Miller,IT,95000,1.0,5
0,Smith,IT,90000,2.0,4
8,Rodriguez,MKTG,85000,1.0,3
5,Garcia,Management,120000,1.0,7
