In [315]:
import pandas as pd
import json
import numpy as np

# Lift pandas' display limits so wide/long frames are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [316]:
# Load the scraped score tables.
df = pd.read_json('dwts_scraper/scores.json')
# Number the rows before anything is removed, so the ids stay stable afterwards.
df['dance_id'] = np.arange(len(df))

# Remove the rows whose `couple` value is the literal 'Dance-off' or 'Dance Duel'.
dance_off_rows = df.index[df['couple'].isin(['Dance-off', 'Dance Duel'])]
df.drop(index=dance_off_rows, inplace=True)

In [317]:
# Turn the comma-separated judge phrase into a list of names (one list per row).
judge_phrases = df['judge_phrase']
df['judge'] = judge_phrases.str.split(",")

In [318]:
# Some weeks (e.g., the ones linked below) have a `score` column instead of a `scores` column.
# Copy those values into `scores` wherever `scores` is missing.

# https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_20)#Weekly_scores Week 5.
# https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_17)#Weekly_scores Week 6.


# https://stackoverflow.com/questions/34989341/how-to-remove-nan-value-while-combining-two-column-in-panda-data-frame
fallback_scores = df['score']
df['scores'] = df['scores'].fillna(value=fallback_scores)

In [319]:
# Show every column name currently present in df.
df.columns

Index(['couple', 'scores', 'dance', 'music', 'result', 'season', 'week',
       'judge_phrase', 'tv_show', 'original_couple', 'score', 'film_theme',
       'disney_film', 'film', 'technical_score', 'performance_score',
       'dance_chosen_by', 'couple_trio_dance_partner_',
       'cirque_du_soleil_show', 'couple_professionals_', 'broadway_show',
       'dances_chosen_by', 'stevie_wonder_music', 'team_captain', 'era',
       'cher_music', 'movie_genre', 'couple_judge_', 'villain', 'dynamic_duo',
       'results', 'dance_theme_chosen_by', 'couple_team_up_judge_',
       'guest_performers', 'music_iconic_routine_', 'icon', 'musical', 'date',
       'performers', 'tracks_performed', 'dancers', 'britney_spears_music',
       'tribute_to_', 'grease_music', 'horror_film_show', 'queen_music',
       'unnamed_5', 'janet_jackson_music', 'dance_id', 'judge'],
      dtype='object')

In [320]:
# https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_9)#Weekly_scores Week 6 has a 'ranked order' score dance.
# Dance-offs are also a problem: some are separate tables, while others are the bottom rows of a regular table.






In [321]:
# Some weeks have a technical score and a performance score, but no scores column.

# https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_11)#Weekly_scores Week 4
# https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_10)#Weekly_scores Week 4

# double_score = df[df['technical_score'].notna()]

# double_score[double_score.columns[~double_score.isnull().any()]]

In [322]:
# Quite a few rows from tables that shouldn't be parsed (after end of week shows).  e.g. tributes.
# Either fix in parsing or drop here.

# Remove, in place, every row that has no value in the `dance` column.
rows_without_dance = df.index[df['dance'].isna()]
df.drop(index=rows_without_dance, inplace=True)

# df[df['dance'].isna()]



In [323]:
# Some tables put additional info in parentheses after the couple, under a differently
# named column. Those rows end up with an NA `couple`, so recover it from the variant columns.

# Matches "<couple> (<additional>)" anchored at the end of the string.
pattern = r'(?P<couple_alone>.*?)\((?P<additional>.*?)\)$'


def _fill_couple_from(frame, source_col, extra_col, regex=pattern):
    """Split ``frame[source_col]`` with ``regex`` and fold the pieces into ``frame``.

    The text inside the parentheses is stored in a new ``extra_col`` column and the
    text before them fills the missing values of ``frame['couple']``.  Both pieces
    are whitespace-stripped, so "A & B (C)" yields "A & B" rather than "A & B ".

    ``frame`` is modified in place.  Rows where ``regex`` does not match get NA in
    ``extra_col`` and leave ``couple`` unchanged.
    """
    parts = frame[source_col].str.extract(regex)
    frame[extra_col] = parts['additional'].str.strip()
    frame['couple'] = frame['couple'].fillna(parts['couple_alone'].str.strip())


# e.g., https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_27) Week 4.  Couple (Trio Dance Partner)
_fill_couple_from(df, 'couple_trio_dance_partner_', 'trio_partner')

# https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_16) Week 5 (Professionals).
_fill_couple_from(df, 'couple_professionals_', 'additional_dancers')

# couple_team_up_judge_ from https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_22)#Week_8:_Judges'_Team-up_Challenge
# Some of these are multi-couple dances, with a judge listed in the couple column, and then an X
# for the score that the judge who coached the team would have given.
# Others are regular dances (just a different column name), so the raw value is used as-is.
df['couple'] = df['couple'].fillna(df['couple_team_up_judge_'])

# couple_judge_ from https://en.wikipedia.org/wiki/Dancing_with_the_Stars_(American_season_11)#Week_10:_Finals
# is much simpler (just a judge-chosen dance).
_fill_couple_from(df, 'couple_judge_', 'judge_choosing_dance')

# One row remains from couple_judge_ due to a substitution which then didn't have a judge,
# so the regex above doesn't catch it: row 2341, where couple_judge_ is "Nastia & Sasha[a]".
# Fall back to the raw column value for it.
df['couple'] = df['couple'].fillna(df['couple_judge_'])
# df.iloc[2341]

# The variant columns have been folded into `couple` and the new extra-info columns.
df.drop(columns=['couple_trio_dance_partner_', 'couple_professionals_', 'couple_judge_'], inplace=True)

# df[df['couple'].isna()]

In [324]:
# Some dances carry a technical_score and a performance_score instead of one score.
# Reshape those into two rows per dance, each with its value in `scores` and a
# `score_type` saying which of the two it is; every other row is tagged "single_score".
val_cols = ['technical_score', 'performance_score']
has_split_score = df['technical_score'].notna()

# Rows with split scores: drop the unused `scores` column and any column that is
# entirely empty for these rows, then melt the two score columns into long form.
split_rows = (
    df.loc[has_split_score]
    .drop(columns="scores")
    .dropna(axis=1, how="all")
)
id_cols = [col for col in split_rows.columns if col not in val_cols]
double_scores = split_rows.melt(
    id_vars=id_cols,
    value_vars=val_cols,
    var_name="score_type",
    value_name="scores",
)

# All remaining rows keep their single `scores` value.
single_scores = (
    df.drop(index=df.index[has_split_score])
    .assign(score_type="single_score")
    .drop(columns=val_cols)
)

df_score_type = pd.concat([double_scores, single_scores])


In [325]:
# df_score_type[df_score_type['score_type'].isna()]

In [326]:
# Split a score string of the form "<total> (<individual scores>)" into its two parts.
# Rows that do not have that shape get NA in both new columns.
score_pattern = r'(?P<total_score>\d+)\s\((?P<indiv_scores>.*)\)'
parsed_scores = df_score_type['scores'].str.extract(score_pattern)
df_score_type['total_score'] = parsed_scores['total_score']
df_score_type['indiv_scores'] = parsed_scores['indiv_scores']

# new_cols['judge_score_list'] = new_cols['indiv_scores'].str.split(",")


# df_new = pd.concat([df, new_cols], axis = 1)

In [327]:
# song_cols = df['music'].str.extract(r'"(?P<song_name>.*)"—(?P<song_artist>.*)$')
# song_cols
# df["music"].str.split(" / ") Some music columns have multiples.

In [328]:


# most are multi couple dances, with ranking scores.

# Several couples sharing one dance are joined with "---" in `couple`; split them into a list.
df_score_type["couple_list"] = df_score_type["couple"].str.split("---")
# `.str.len()` counts the list items and yields NA for a missing couple, whereas
# `.apply(len)` would raise TypeError on the NaN placeholder.
df_score_type["couple_count"] = df_score_type["couple_list"].str.len()
# a few have a total score, but not an individual score.  e.g., s9w10.

In [329]:
# Inspect the rows whose `scores` text could not be parsed into a total score.
missing_total = df_score_type['total_score'].isna()
df_score_type.loc[missing_total]

Unnamed: 0,couple,dance,music,result,season,week,judge_phrase,dance_id,judge,score_type,scores,tv_show,original_couple,score,film_theme,disney_film,film,dance_chosen_by,cirque_du_soleil_show,broadway_show,dances_chosen_by,stevie_wonder_music,team_captain,era,cher_music,movie_genre,villain,dynamic_duo,results,dance_theme_chosen_by,couple_team_up_judge_,guest_performers,music_iconic_routine_,icon,musical,date,performers,tracks_performed,dancers,britney_spears_music,tribute_to_,grease_music,horror_film_show,queen_music,unnamed_5,janet_jackson_music,trio_partner,additional_dancers,judge_choosing_dance,total_score,indiv_scores,couple_list,couple_count
137,Chuck & Anna T.---Ashley & Edyta---Donny & Kym...,Salsa,"""Get Busy""—Sean Paul","""Get Busy""—Sean Paul",9,Week 1,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",137,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,6---4---10---8,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,"[Chuck & Anna T., Ashley & Edyta, Donny & Kym,...",4
138,Aaron & Karina---Mark & Lacey---Tom & Cheryl--...,Viennese waltz,"""I'm Your Man""—Leonard Cohen","""I'm Your Man""—Leonard Cohen",9,Week 1,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",138,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,10---8---4---6,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,"[Aaron & Karina, Mark & Lacey, Tom & Cheryl, M...",4
433,Diana & Henry,Cha-cha-cha,"""Move Your Feet""—Junior Senior",Eliminated,18,Week 2: Celebrity's Pick Night,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli",433,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonioli]",single_score,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,[Diana & Henry],1
629,Wynonna & Tony---Victor & Lindsay---D.L. & Che...,Freestyle,"""The Rockafeller Skank""—Fatboy Slim","""The Rockafeller Skank""—Fatboy Slim",16,Week 3: Prom Night,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",629,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,"[Wynonna & Tony, Victor & Lindsay, D.L. & Cher...",11
789,Misty & Maks,Jive,"""Shake It""—Metro Station",Withdrew,7,Week 3,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",789,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,[Misty & Maks],1
847,Ray & Cheryl,Cha-cha-cha,"""Twist and Shout""—The Beatles",Withdrew,28,Week 3: Movie Night,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",847,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,,,,,,,Ferris Bueller's Day Off,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,[Ray & Cheryl],1
957,Shannon & Derek---Marissa & Tony---Kristi & Ma...,Two-step,"""Cotton-Eyed Joe""—The Nashville Riders","""Cotton-Eyed Joe""—The Nashville Riders",6,Week 6,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",957,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,"[Shannon & Derek, Marissa & Tony, Kristi & Mar...",7
1074,Stacy & Tony---Tia & Maksim---Lisa & Louis---G...,Salsa,"""Rhythm is Gonna Get You""—Gloria Estefan","""Rhythm is Gonna Get You""—Gloria Estefan",2,Week 5,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",1074,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,"[Stacy & Tony, Tia & Maksim, Lisa & Louis, Geo...",6
1121,Jane & Tony---Cameron & Edyta---Sabrina & Mark...,Rock and roll,"""Rockin' Robin""—Bobby Day","""Rockin' Robin""—Bobby Day",5,Week 6,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",1121,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,"[Jane & Tony, Cameron & Edyta, Sabrina & Mark,...",7
1240,Drew & Cheryl---Stacy & Tony---George & Edyta-...,Viennese waltz,"""Fallin'""—Alicia Keys","""Fallin'""—Alicia Keys",2,Week 6,"Carrie Ann Inaba, Len Goodman, Bruno Tonioli.",1240,"[ Carrie Ann Inaba, Len Goodman, Bruno Tonio...",single_score,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,"[Drew & Cheryl, Stacy & Tony, George & Edyta, ...",5


In [None]:
# Build one row per (dance, judge), pairing each judge with that judge's individual score.
# The frame is derived from `df_score_type`: the `df_new` / `judge_score_list` names this
# cell used to rely on are never created in the notebook, so it failed with NameError.
scored = df_score_type.assign(
    judge_score_list=df_score_type['indiv_scores'].str.split(","),
)

# A multi-column explode raises ValueError unless both lists have the same number of
# items in every row. Rows without individual scores (NA) or with a judge/score count
# mismatch are therefore left out; inspect them with `scored.loc[~explodable]`.
judge_counts = scored['judge'].str.len().to_numpy()
score_counts = scored['judge_score_list'].str.len().to_numpy()
explodable = judge_counts == score_counts  # NaN never compares equal, so NA rows are False

df_final = (
    scored.loc[explodable]
    .explode(['judge', 'judge_score_list'])
    .rename(columns={'judge_score_list': 'judge_score'})
    .drop(columns=['indiv_scores', 'judge_phrase', 'scores'])
    .assign(
        # Judge names carry leading spaces and a trailing full stop from the phrase.
        judge=lambda frame: frame['judge'].str.strip(" ."),
        # Scores split on "," keep the space that followed each comma.
        judge_score=lambda frame: frame['judge_score'].str.strip(),
    )
)

In [None]:
# Work out a column order that puts the key identifying/score columns first
# and keeps every other column after them in its existing order.
front_cols = [
    'dance_id', 'season', 'week', 'couple', 'dance', 'music',
    'judge', 'judge_score', 'total_score',
]

all_cols = list(df_final.columns)
remaining_cols = [col for col in all_cols if col not in front_cols]

reordered = [*front_cols, *remaining_cols]

In [None]:
# Apply the column order computed above.
df_final = df_final.loc[:, reordered]

In [None]:
# Preview only the first rows: with display.max_rows set to None, a bare `df_final`
# would render every row of the exploded frame into the notebook output.
df_final.head(20)