# Combining the original dramalist dataset with the reviewed dramas dataset
### In order to have more features for our dramalist recommender system, I will combine the dramalist dataset with the parquet user rating data that has the series id number (sid). 


In [3]:
import pandas as pd
import numpy as np

### importing and reviewing dramalist dataset

In [4]:
df = pd.read_csv('thisIsLast(fixedyear).csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4797 entries, 0 to 4796
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             4797 non-null   object 
 1   category         4797 non-null   object 
 2   country          4797 non-null   object 
 3   num_episodes     4797 non-null   float64
 4   aired            4797 non-null   int64  
 5   orginal_network  4461 non-null   object 
 6   duration         4652 non-null   object 
 7   watchers         4797 non-null   int64  
 8   director         3676 non-null   object 
 9   screenwriter     2867 non-null   object 
 10  rating           4797 non-null   float64
 11  num_raters       4797 non-null   int64  
 12  cast_names       4793 non-null   object 
 13  genre_names      4769 non-null   object 
 14  tag_names        4797 non-null   object 
 15  synopsis         4672 non-null   object 
 16  url              4797 non-null   object 
dtypes: float64(2),

In [6]:
df.head()

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,rating,num_raters,cast_names,genre_names,tag_names,synopsis,url
0,Tunnel,Drama,South Korea,16.0,2017,OCN,60 min.,32651,"Nam Ki Hoon, Shin Yong Hwi",Lee Eun Mi,8.7,14787,"Choi Jin Hyuk,Yoon Hyun Min,Lee Yoo Young,Jo H...","Thriller,Mystery,Sci-Fi,Fantasy","Time Travel,Murder,Criminal Profiler,Serial K...","In 1986, Park Gwang Ho works as an excellent a...",https://i.mydramalist.com/JkryYc.jpg?v=1
1,Twenty-Five Twenty-One,Drama,South Korea,16.0,2022,Netflix tvN,1 hr. 13 min.,16043,Jung Ji Hyun,Kwon Do Eun,8.8,2119,"Kim Tae Ri,Nam Joo Hyuk,Bona,Choi Hyun Wook,Le...","Romance,Life,Youth,Drama","Athlete,Fencing,1990s,Coming Of Age,Bold Fema...",The story is set in 1998 and tells the stories...,https://i.mydramalist.com/ROOPo_4c.jpg?v=1
2,Dr. Romantic 2,Drama,South Korea,16.0,2020,SBS,1 hr. 10 min.,31855,"Yoo In Shik, Lee Gil Bok",Kang Eun Kyung,8.7,15880,"Han Seok Kyu,Ahn Hyo Seop,Lee Sung Kyung,Kim J...","Romance,Drama,Medical,Melodrama","Hospital,Smart Male Lead,Character Developmen...","A real doctor story set in a small, humble h...",https://i.mydramalist.com/Rr7DEc.jpg?v=1
3,Move to Heaven,Drama,South Korea,10.0,2021,Netflix,52 min.,40962,Kim Sung Ho,Yoon Ji Ryun,9.2,20399,"Lee Je Hoon,Tang Jun Sang,Hong Seung Hee,Jung ...","Life,Drama,Family","Autism,Uncle-Nephew Relationship,Death,Savant...",Geu Roo is a young autistic man. He works for ...,https://i.mydramalist.com/Rle36_4c.jpg?v=1
4,Crash Landing on You,Drama,South Korea,16.0,2020,Netflix tvN,1 hr. 25 min.,110352,Lee Jeong Hyo,Park Ji Eun,9.0,62174,"Hyun Bin,Son Ye Jin,Seo Ji Hye,Kim Jung Hyun,Y...","Military,Comedy,Romance,Political","North And South Korea,Star Crossed Lovers,Str...","After getting into a paragliding accident, Sou...",https://i.mydramalist.com/XrN2dc.jpg?v=1


In [7]:
# multiple columns of data that could gain potential insights to user preferences 
df.columns

Index(['Name', 'category', 'country', 'num_episodes', 'aired',
       'orginal_network', 'duration', 'watchers', 'director', 'screenwriter',
       'rating', 'num_raters', 'cast_names', 'genre_names', 'tag_names',
       'synopsis', 'url'],
      dtype='object')

### parquet review dataset 

In [8]:
shows = pd.read_parquet('df_shows.parquet', engine='pyarrow')
shows = shows[shows['kind']== 'Drama']

In [9]:
shows.columns

Index(['country', 'year', 'kind', 'title', 'sid'], dtype='object')

In [10]:
df['Name'] = df['Name'].str.lower()
shows['title'] = shows['title'].str.lower()

In [12]:
shows.head()

Unnamed: 0,country,year,kind,title,sid
0,South Korea,2019,Drama,melting me softly,25652
1,South Korea,2021,Drama,snowdrop,33603
2,South Korea,2020,Drama,365: repeat the year,29824
3,South Korea,2019,Drama,big issue,24045
4,South Korea,2019,Drama,graceful family,25688


In [13]:
## as we can see above, not much data to use for recommender system, so will combine with above dataset

In [14]:
#using matching data to merge the two datasets, 'Name', 'country', 'category', 'aired'
exact_match = pd.merge(df, shows, left_on=['Name', 'country', 'category', 'aired'], right_on=['title', 'country', 'kind','year'], how='left')

exact_match


Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,cast_names,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid
0,tunnel,Drama,South Korea,16.0,2017,OCN,60 min.,32651,"Nam Ki Hoon, Shin Yong Hwi",Lee Eun Mi,...,"Choi Jin Hyuk,Yoon Hyun Min,Lee Yoo Young,Jo H...","Thriller,Mystery,Sci-Fi,Fantasy","Time Travel,Murder,Criminal Profiler,Serial K...","In 1986, Park Gwang Ho works as an excellent a...",https://i.mydramalist.com/JkryYc.jpg?v=1,2016,2017.0,Drama,tunnel,15673.0
1,twenty-five twenty-one,Drama,South Korea,16.0,2022,Netflix tvN,1 hr. 13 min.,16043,Jung Ji Hyun,Kwon Do Eun,...,"Kim Tae Ri,Nam Joo Hyuk,Bona,Choi Hyun Wook,Le...","Romance,Life,Youth,Drama","Athlete,Fencing,1990s,Coming Of Age,Bold Fema...",The story is set in 1998 and tells the stories...,https://i.mydramalist.com/ROOPo_4c.jpg?v=1,2021,2022.0,Drama,twenty-five twenty-one,40541.0
2,dr. romantic 2,Drama,South Korea,16.0,2020,SBS,1 hr. 10 min.,31855,"Yoo In Shik, Lee Gil Bok",Kang Eun Kyung,...,"Han Seok Kyu,Ahn Hyo Seop,Lee Sung Kyung,Kim J...","Romance,Drama,Medical,Melodrama","Hospital,Smart Male Lead,Character Developmen...","A real doctor story set in a small, humble h...",https://i.mydramalist.com/Rr7DEc.jpg?v=1,2019,,,,
3,move to heaven,Drama,South Korea,10.0,2021,Netflix,52 min.,40962,Kim Sung Ho,Yoon Ji Ryun,...,"Lee Je Hoon,Tang Jun Sang,Hong Seung Hee,Jung ...","Life,Drama,Family","Autism,Uncle-Nephew Relationship,Death,Savant...",Geu Roo is a young autistic man. He works for ...,https://i.mydramalist.com/Rle36_4c.jpg?v=1,2020,2021.0,Drama,move to heaven,29419.0
4,crash landing on you,Drama,South Korea,16.0,2020,Netflix tvN,1 hr. 25 min.,110352,Lee Jeong Hyo,Park Ji Eun,...,"Hyun Bin,Son Ye Jin,Seo Ji Hye,Kim Jung Hyun,Y...","Military,Comedy,Romance,Political","North And South Korea,Star Crossed Lovers,Str...","After getting into a paragliding accident, Sou...",https://i.mydramalist.com/XrN2dc.jpg?v=1,2019,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4797,ultraman dyna,Drama,Japan,51.0,1998,MBS,24 min.,253,Konaka Kazuya,"Hasegawa Keiichi, Migita Masakazu",...,"Tsuruno Takeshi,Kinomoto Ryo,Fukawa Toshikazu,...","Action,Tokusatsu,Sci-Fi","Superhero,Kaiju,Ultraman",In the year AD 2017 (nine years after the fina...,https://i.mydramalist.com/rZEEj_4c.jpg?v=1,1997,,,,
4798,iron pear,Drama,China,43.0,2010,GRT,1 hr. 30 min.,18,Guo Jing Yu,,...,"Chen Shu,Wei Zi,Zhang Shao Hua,Yang Zhi Gang,J...","Action,Romance,Drama",,This show is about the trials and tribulations...,https://i.mydramalist.com/ONPvLc.jpg?v=1,2009,2010.0,Drama,iron pear,12507.0
4799,team batista 3,Drama,Japan,11.0,2011,Fuji TV,45 min.,122,"Imai Kazuhisa, Komatsu Takashi, Hoshino Kazu...","Goto Noriko, Tanaka Shinichi",...,"Nakamura Toru,Ito Atsushi,Natori Yuko,Takahash...","Mystery,Medical","Adapted From A Novel,Investigation",A murder occurs inside a new MRI model in conn...,https://i.mydramalist.com/MABZXc.jpg?v=1,2010,2011.0,Drama,team batista 3,2701.0
4800,la grande chaumiere violette,Drama,Taiwan,22.0,2016,SET TV,1 hr. 30 min.,118,Nelson Yeh,,...,"Ko Chia Yen,Eli Shi,Grace Lin,Bryant Lee,Fion ...",,,"San Francisco, 1980. 72-year-old Guo Xue Hu ru...",https://i.mydramalist.com/2Qv82c.jpg?v=1,2015,2016.0,Drama,la grande chaumiere violette,13872.0


In [15]:
df.reset_index(drop=True, inplace=True)
exact_match.reset_index(drop=True, inplace=True)

In [16]:
exact_match.columns

Index(['Name', 'category', 'country', 'num_episodes', 'aired',
       'orginal_network', 'duration', 'watchers', 'director', 'screenwriter',
       'rating', 'num_raters', 'cast_names', 'genre_names', 'tag_names',
       'synopsis', 'url', 'adjusted_year', 'year', 'kind', 'title', 'sid'],
      dtype='object')

In [17]:
# Mark the rows that have been merged, the ones that have NAN values marked FALSE
exact_match['merged'] = exact_match['title'].notna()
exact_match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4802 entries, 0 to 4801
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             4802 non-null   object  
 1   category         4802 non-null   object  
 2   country          4802 non-null   object  
 3   num_episodes     4802 non-null   float64 
 4   aired            4802 non-null   int64   
 5   orginal_network  4465 non-null   object  
 6   duration         4657 non-null   object  
 7   watchers         4802 non-null   int64   
 8   director         3680 non-null   object  
 9   screenwriter     2869 non-null   object  
 10  rating           4802 non-null   float64 
 11  num_raters       4802 non-null   int64   
 12  cast_names       4798 non-null   object  
 13  genre_names      4774 non-null   object  
 14  tag_names        4802 non-null   object  
 15  synopsis         4677 non-null   object  
 16  url              4802 non-null   object  


In [18]:
exact_match.head()

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
0,tunnel,Drama,South Korea,16.0,2017,OCN,60 min.,32651,"Nam Ki Hoon, Shin Yong Hwi",Lee Eun Mi,...,"Thriller,Mystery,Sci-Fi,Fantasy","Time Travel,Murder,Criminal Profiler,Serial K...","In 1986, Park Gwang Ho works as an excellent a...",https://i.mydramalist.com/JkryYc.jpg?v=1,2016,2017.0,Drama,tunnel,15673.0,True
1,twenty-five twenty-one,Drama,South Korea,16.0,2022,Netflix tvN,1 hr. 13 min.,16043,Jung Ji Hyun,Kwon Do Eun,...,"Romance,Life,Youth,Drama","Athlete,Fencing,1990s,Coming Of Age,Bold Fema...",The story is set in 1998 and tells the stories...,https://i.mydramalist.com/ROOPo_4c.jpg?v=1,2021,2022.0,Drama,twenty-five twenty-one,40541.0,True
2,dr. romantic 2,Drama,South Korea,16.0,2020,SBS,1 hr. 10 min.,31855,"Yoo In Shik, Lee Gil Bok",Kang Eun Kyung,...,"Romance,Drama,Medical,Melodrama","Hospital,Smart Male Lead,Character Developmen...","A real doctor story set in a small, humble h...",https://i.mydramalist.com/Rr7DEc.jpg?v=1,2019,,,,,False
3,move to heaven,Drama,South Korea,10.0,2021,Netflix,52 min.,40962,Kim Sung Ho,Yoon Ji Ryun,...,"Life,Drama,Family","Autism,Uncle-Nephew Relationship,Death,Savant...",Geu Roo is a young autistic man. He works for ...,https://i.mydramalist.com/Rle36_4c.jpg?v=1,2020,2021.0,Drama,move to heaven,29419.0,True
4,crash landing on you,Drama,South Korea,16.0,2020,Netflix tvN,1 hr. 25 min.,110352,Lee Jeong Hyo,Park Ji Eun,...,"Military,Comedy,Romance,Political","North And South Korea,Star Crossed Lovers,Str...","After getting into a paragliding accident, Sou...",https://i.mydramalist.com/XrN2dc.jpg?v=1,2019,,,,,False


In [21]:
df[df['Name'].str.contains('crash landing')]

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,rating,num_raters,cast_names,genre_names,tag_names,synopsis,url,adjusted_year
4,crash landing on you,Drama,South Korea,16.0,2020,Netflix tvN,1 hr. 25 min.,110352,Lee Jeong Hyo,Park Ji Eun,9.0,62174,"Hyun Bin,Son Ye Jin,Seo Ji Hye,Kim Jung Hyun,Y...","Military,Comedy,Romance,Political","North And South Korea,Star Crossed Lovers,Str...","After getting into a paragliding accident, Sou...",https://i.mydramalist.com/XrN2dc.jpg?v=1,2019


In [22]:
shows[shows['title'].str.contains('crash landing')]

Unnamed: 0,country,year,kind,title,sid
50,South Korea,2019,Drama,crash landing on you,26304


### looking at the above, we can see there there is a discrepancy in the years
### one is listing the "aired" and the other is just listing the year. this could be due to various reasons, ie. the drama aired late in the year and continued to the next, mydramalist website listed when it first was added to their website, etc. 

### In order to fix this, we will add another column with a year -1 to see if we can match the dramas that list all the other correct data. 

In [None]:
# adding adjusted year column 
df['adjusted_year'] = df['aired'] -1

In [274]:
# Perform the second merge (adjusted year) for unmatched rows
adjusted_match = pd.merge(df[~exact_match['merged']], shows, left_on=['Name', 'country', 'category', 'adjusted_year'], right_on=['title', 'country', 'kind','year'], how='left')

adjusted_match


  adjusted_match = pd.merge(df[~exact_match['merged']], shows, left_on=['Name', 'country', 'category', 'adjusted_year'], right_on=['title', 'country', 'kind','year'], how='left')


Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,cast_names,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid
0,crash landing on you,Drama,South Korea,16.0,2020,Netflix tvN,1 hr. 25 min.,110352,Lee Jeong Hyo,Park Ji Eun,...,"Hyun Bin,Son Ye Jin,Seo Ji Hye,Kim Jung Hyun,Y...","Military,Comedy,Romance,Political","North And South Korea,Star Crossed Lovers,Str...","After getting into a paragliding accident, Sou...",https://i.mydramalist.com/XrN2dc.jpg?v=1,2019,2019.0,Drama,crash landing on you,26304.0
1,not me,Drama,Thailand,14.0,2022,GMM 25,45 min.,16719,Nuchy Anucha Boonyawatana,"Noolek Sureechay Kaewses, M Rittikrai Kanjana...",...,"Gun Atthaphan Phunsawat,Off Jumpol Adulkittipo...","Action,Thriller,Mystery,Romance","Hidden Identity,Suspense,Political,Identical ...",Black and White are twins with a powerful conn...,https://i.mydramalist.com/0kjbY_4c.jpg?v=1,2021,2021.0,Drama,not me,38160.0
2,sky castle,Drama,South Korea,20.0,2019,jTBC,1 hr. 15 min.,49510,Jo Hyun Taek,Yoo Hyun Mi,...,"Yeom Jung Ah,Lee Tae Ran,Yoon Se Ah,Oh Na Ra,K...","Mystery,Psychological,Drama,Family","Multiple Mains,Social Commentary,Sibling,Fami...","Han Seo Jin, Noh Seung Hye, Jin Jin Hee, and L...",https://i.mydramalist.com/4053wc.jpg?v=1,2018,2018.0,Drama,sky castle,22877.0
3,the red sleeve,Drama,South Korea,17.0,2022,MBC Viki,1 hr. 20 min.,19620,Jung Ji In,Jung Hae Ri,...,"Lee Jun Ho,Lee Se Young,Kang Hoon,Lee Deok Hwa...","Historical,Romance,Drama,Melodrama","Biographical,Strong Male Lead,Male Chases Fem...","In Korea during the second half of the 1700s, ...",https://i.mydramalist.com/2WpKk_4c.jpg?v=1,2021,2021.0,Drama,the red sleeve,33230.0
4,just between lovers,Drama,South Korea,16.0,2018,jTBC,1 hr. 13 min.,59031,Kim Jin Won,Yoo Bo Ra,...,"Lee Jun Ho,Won Jin Ah,Lee Ki Woo,Kang Han Na,K...","Psychological,Romance,Melodrama","Trauma,Healing,Survivor Guilt,Hardworking Fem...",A major accident takes the lives of 48 people....,https://i.mydramalist.com/rkKPmc.jpg?v=1,2017,2017.0,Drama,just between lovers,17704.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,ace troops,Drama,China,40.0,2022,iQiyi JSTV,45 min.,2995,,,...,"Johnny Huang,Xiao Zhan,Elaine Zhong,Li You Bin...","Action,Military,War","Army Officer,Army,Military Training,Soldier,R...",Gao Liang and Gu Yi Ye are recruited into the ...,https://i.mydramalist.com/XoRqp_4c.jpg?v=1,2021,2021.0,Drama,ace troops,34883.0
956,kasouken no onna season 5,Drama,Japan,9.0,2004,TV Asahi,45 min.,27,,,...,"Sawaguchi Yasuko,Izumi Masayuki,Naito Takashi,...","Thriller,Mystery,Medical","Forensic Medical Examiner,Forensic,Investigat...",As criminals become more sophisticated and cri...,https://i.mydramalist.com/G18kRc.jpg?v=1,2003,,,,
957,grow up,Drama,China,38.0,2015,Dragon TV,45 min.,502,Lin Yan,Zhang Wei,...,"Lu Yi,Bai Bai He,Fenny Wu,Fang Xiao Li,Liu Lu,...","Romance,Drama,Medical","Doctor Female Lead,Doctor Male Lead,Hardworki...",Ye Chun Meng is a talented medical intern who ...,https://i.mydramalist.com/Rjykxc.jpg?v=1,2014,,,,
958,khun por rub jang,Drama,Thailand,15.0,2004,Channel 7,2 hr. 0 min.,32,,,...,"Num Sornram Theappitak,Kob Suvanant Kongying,P...","Comedy,Romance",,Natcha is a lawyer and a single mother. She wa...,https://i.mydramalist.com/Yoxqgc.jpg?v=1,2003,,,,


In [275]:
adjusted_match.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 960 entries, 0 to 959
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             960 non-null    object  
 1   category         960 non-null    object  
 2   country          960 non-null    object  
 3   num_episodes     960 non-null    float64 
 4   aired            960 non-null    int64   
 5   orginal_network  893 non-null    object  
 6   duration         921 non-null    object  
 7   watchers         960 non-null    int64   
 8   director         718 non-null    object  
 9   screenwriter     560 non-null    object  
 10  rating           960 non-null    float64 
 11  num_raters       960 non-null    int64   
 12  cast_names       958 non-null    object  
 13  genre_names      956 non-null    object  
 14  tag_names        960 non-null    object  
 15  synopsis         927 non-null    object  
 16  url              960 non-null    object  
 1

In [276]:
# the problem i think is adding this new year, kind, title, sid data to the original dataset... 
# it shouldnt create new rows, it should add to the original data of the ones with rows that are false? 
## will merge them anyways, and then check duplicates with the same names and such, then remove the ones with NA ? 


In [277]:
adjusted_match.head(50)

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,cast_names,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid
0,crash landing on you,Drama,South Korea,16.0,2020,Netflix tvN,1 hr. 25 min.,110352,Lee Jeong Hyo,Park Ji Eun,...,"Hyun Bin,Son Ye Jin,Seo Ji Hye,Kim Jung Hyun,Y...","Military,Comedy,Romance,Political","North And South Korea,Star Crossed Lovers,Str...","After getting into a paragliding accident, Sou...",https://i.mydramalist.com/XrN2dc.jpg?v=1,2019,2019.0,Drama,crash landing on you,26304.0
1,not me,Drama,Thailand,14.0,2022,GMM 25,45 min.,16719,Nuchy Anucha Boonyawatana,"Noolek Sureechay Kaewses, M Rittikrai Kanjana...",...,"Gun Atthaphan Phunsawat,Off Jumpol Adulkittipo...","Action,Thriller,Mystery,Romance","Hidden Identity,Suspense,Political,Identical ...",Black and White are twins with a powerful conn...,https://i.mydramalist.com/0kjbY_4c.jpg?v=1,2021,2021.0,Drama,not me,38160.0
2,sky castle,Drama,South Korea,20.0,2019,jTBC,1 hr. 15 min.,49510,Jo Hyun Taek,Yoo Hyun Mi,...,"Yeom Jung Ah,Lee Tae Ran,Yoon Se Ah,Oh Na Ra,K...","Mystery,Psychological,Drama,Family","Multiple Mains,Social Commentary,Sibling,Fami...","Han Seo Jin, Noh Seung Hye, Jin Jin Hee, and L...",https://i.mydramalist.com/4053wc.jpg?v=1,2018,2018.0,Drama,sky castle,22877.0
3,the red sleeve,Drama,South Korea,17.0,2022,MBC Viki,1 hr. 20 min.,19620,Jung Ji In,Jung Hae Ri,...,"Lee Jun Ho,Lee Se Young,Kang Hoon,Lee Deok Hwa...","Historical,Romance,Drama,Melodrama","Biographical,Strong Male Lead,Male Chases Fem...","In Korea during the second half of the 1700s, ...",https://i.mydramalist.com/2WpKk_4c.jpg?v=1,2021,2021.0,Drama,the red sleeve,33230.0
4,just between lovers,Drama,South Korea,16.0,2018,jTBC,1 hr. 13 min.,59031,Kim Jin Won,Yoo Bo Ra,...,"Lee Jun Ho,Won Jin Ah,Lee Ki Woo,Kang Han Na,K...","Psychological,Romance,Melodrama","Trauma,Healing,Survivor Guilt,Hardworking Fem...",A major accident takes the lives of 48 people....,https://i.mydramalist.com/rkKPmc.jpg?v=1,2017,2017.0,Drama,just between lovers,17704.0
5,healer,Drama,South Korea,20.0,2015,KBS2,60 min.,106425,"Lee Jung Sub, Kim Jin Woo",Song Ji Na,...,"Ji Chang Wook,Park Min Young,Yoo Ji Tae,Kim M...","Action,Thriller,Mystery,Romance","Double Identity,Hidden Identity,Smart Male Le...",Seo Jung Hoo is a special kind of night courie...,https://i.mydramalist.com/lBOo4c.jpg?v=1,2014,2014.0,Drama,healer,9779.0
6,our beloved summer,Drama,South Korea,16.0,2022,Netflix SBS,60 min.,32152,Kim Yoon Jin,Lee Na Eun,...,"Choi Woo Shik,Kim Da Mi,Kim Sung Cheol,Roh Je...","Romance,Youth,Drama","Lovers Reunited,High School Sweethearts,Heali...",Years after filming a viral documentary in hig...,https://i.mydramalist.com/4QLgQ_4c.jpg?v=1,2021,2021.0,Drama,our beloved summer,40370.0
7,ultimate note,Drama,China,37.0,2021,iQiyi,45 min.,5140,"Zou Xi, Wei Li Zhou","Zhang Yuan Ang, Tian Liang Liang",...,"Joseph Zeng,Xiao Yu Liang,Cheng Fang Xu,Liu Yu...","Action,Adventure,Thriller,Mystery","Tomb Raiding,Bromance,Survival,Smart Male Lea...",Wu Xie and Wang Pan Zi are trying to find out ...,https://i.mydramalist.com/prEne_4c.jpg?v=1,2020,2020.0,Drama,ultimate note,30023.0
8,empress ki,Drama,South Korea,51.0,2014,MBC,1 hr. 5 min.,33394,"Han Hee, Lee Sung Joon","Jang Young Chul, Jung Kyung Soon",...,"Ha Ji Won,Ji Chang Wook,Joo Jin Mo,Baek Jin He...","Historical,Romance,Melodrama,Political","Smart Female Lead,Strong Female Lead,Power St...",Genghis Khan built an empire that spanned vast...,https://i.mydramalist.com/JBpElc.jpg?v=1,2013,2013.0,Drama,empress ki,6540.0
9,prison playbook,Drama,South Korea,16.0,2018,Netflix tvN,1 hr. 32 min.,52804,Shin Won Ho,"Jung Bo Hoon, Lee Woo Jung",...,"Park Hae Soo,Jung Kyung Ho,Krystal Jung,Im Hwa...","Comedy,Crime,Life,Drama","Prison,Bromance,Wrongfully Accused,Life Lesso...","Kim Je Hyuk, a famous baseball player, is arre...",https://i.mydramalist.com/xXK7yc.jpg?v=1,2017,2017.0,Drama,prison playbook,16435.0


In [278]:
# taking only the ones with sid data 
adjusted_match = adjusted_match[adjusted_match['title'].notna() & adjusted_match['sid'].notna()]

In [279]:
adjusted_match.head()

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,cast_names,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid
0,crash landing on you,Drama,South Korea,16.0,2020,Netflix tvN,1 hr. 25 min.,110352,Lee Jeong Hyo,Park Ji Eun,...,"Hyun Bin,Son Ye Jin,Seo Ji Hye,Kim Jung Hyun,Y...","Military,Comedy,Romance,Political","North And South Korea,Star Crossed Lovers,Str...","After getting into a paragliding accident, Sou...",https://i.mydramalist.com/XrN2dc.jpg?v=1,2019,2019.0,Drama,crash landing on you,26304.0
1,not me,Drama,Thailand,14.0,2022,GMM 25,45 min.,16719,Nuchy Anucha Boonyawatana,"Noolek Sureechay Kaewses, M Rittikrai Kanjana...",...,"Gun Atthaphan Phunsawat,Off Jumpol Adulkittipo...","Action,Thriller,Mystery,Romance","Hidden Identity,Suspense,Political,Identical ...",Black and White are twins with a powerful conn...,https://i.mydramalist.com/0kjbY_4c.jpg?v=1,2021,2021.0,Drama,not me,38160.0
2,sky castle,Drama,South Korea,20.0,2019,jTBC,1 hr. 15 min.,49510,Jo Hyun Taek,Yoo Hyun Mi,...,"Yeom Jung Ah,Lee Tae Ran,Yoon Se Ah,Oh Na Ra,K...","Mystery,Psychological,Drama,Family","Multiple Mains,Social Commentary,Sibling,Fami...","Han Seo Jin, Noh Seung Hye, Jin Jin Hee, and L...",https://i.mydramalist.com/4053wc.jpg?v=1,2018,2018.0,Drama,sky castle,22877.0
3,the red sleeve,Drama,South Korea,17.0,2022,MBC Viki,1 hr. 20 min.,19620,Jung Ji In,Jung Hae Ri,...,"Lee Jun Ho,Lee Se Young,Kang Hoon,Lee Deok Hwa...","Historical,Romance,Drama,Melodrama","Biographical,Strong Male Lead,Male Chases Fem...","In Korea during the second half of the 1700s, ...",https://i.mydramalist.com/2WpKk_4c.jpg?v=1,2021,2021.0,Drama,the red sleeve,33230.0
4,just between lovers,Drama,South Korea,16.0,2018,jTBC,1 hr. 13 min.,59031,Kim Jin Won,Yoo Bo Ra,...,"Lee Jun Ho,Won Jin Ah,Lee Ki Woo,Kang Han Na,K...","Psychological,Romance,Melodrama","Trauma,Healing,Survivor Guilt,Hardworking Fem...",A major accident takes the lives of 48 people....,https://i.mydramalist.com/rkKPmc.jpg?v=1,2017,2017.0,Drama,just between lovers,17704.0


In [280]:
# combining the results from both merges
final_merged_df = pd.concat([exact_match, adjusted_match])

In [281]:
final_merged_df.reset_index(drop=True, inplace=True)

In [282]:
final_merged_df.columns

Index(['Name', 'category', 'country', 'num_episodes', 'aired',
       'orginal_network', 'duration', 'watchers', 'director', 'screenwriter',
       'rating', 'num_raters', 'cast_names', 'genre_names', 'tag_names',
       'synopsis', 'url', 'adjusted_year', 'year', 'kind', 'title', 'sid',
       'merged'],
      dtype='object')

In [283]:
duplicate_columns = ['Name', 'category', 'country', 'num_episodes', 'aired']

# Find all duplicates based on these columns
duplicates = final_merged_df[final_merged_df.duplicated(subset=duplicate_columns, keep=False)]

# Sort the duplicates DataFrame by the duplicate columns
sorted_duplicates = duplicates.sort_values(by=duplicate_columns)
sorted_duplicates

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
877,55:15 never too late,Drama,Thailand,16.0,2022,GMM 25,45 min.,3632,Saranyu Jiralaksanakul,"Manow Waneepan Ounphoklang, Jarinee Thanomyat...",...,"Drama,Sci-Fi,Fantasy","Multiple Mains,High School,Second Chance,Voic...",How many of us remember what our dreams were w...,https://i.mydramalist.com/EV3jm_4c.jpg?v=1,2021,,,,,False
4840,55:15 never too late,Drama,Thailand,16.0,2022,GMM 25,45 min.,3632,Saranyu Jiralaksanakul,"Manow Waneepan Ounphoklang, Jarinee Thanomyat...",...,"Drama,Sci-Fi,Fantasy","Multiple Mains,High School,Second Chance,Voic...",How many of us remember what our dreams were w...,https://i.mydramalist.com/EV3jm_4c.jpg?v=1,2021,2021.0,Drama,55:15 never too late,38154.0,
4525,8 love stories,Drama,South Korea,16.0,2000,SBS,60 min.,294,"Kim Jong Hyeok, Lee Kang Hoon",Song Ji Na,...,"Romance,Drama",Omnibus,The drama describes 8 different love stories. ...,https://i.mydramalist.com/YYz2gc.jpg?v=1,1999,,,,,False
4938,8 love stories,Drama,South Korea,16.0,2000,SBS,60 min.,294,"Kim Jong Hyeok, Lee Kang Hoon",Song Ji Na,...,"Romance,Drama",Omnibus,The drama describes 8 different love stories. ...,https://i.mydramalist.com/YYz2gc.jpg?v=1,1999,1999.0,Drama,8 love stories,9654.0,
4094,90's beijing fantasy,Drama,China,24.0,2019,,25 min.,135,,,...,"School,Drama,Family",Youth,A nostalgic coming-of-age tale that follows yo...,https://i.mydramalist.com/2xBK2c.jpg?v=1,2018,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083,youth,Drama,China,26.0,2018,Youku,45 min.,5793,Ding Zi Guang,Zhang Yun,...,"Mystery,Comedy,Romance,Drama","Sismance,Female Centered Plot,Housemates,Mult...","When five college students share a house, they...",https://i.mydramalist.com/ly1NNc.jpg?v=1,2017,2018.0,Drama,youth,17952.0,True
4254,ysabella,Drama,Philippines,145.0,2008,ABS-CBN,30 min.,8,"Cathy Garcia Molina, Rory Quintos, Don Cuaresma",,...,"Comedy,Romance,Drama","Family,Food",The story revolves around a young cook named Y...,https://i.mydramalist.com/0R73O_4c.jpg?v=1,2007,,,,,False
4926,ysabella,Drama,Philippines,145.0,2008,ABS-CBN,30 min.,8,"Cathy Garcia Molina, Rory Quintos, Don Cuaresma",,...,"Comedy,Romance,Drama","Family,Food",The story revolves around a young cook named Y...,https://i.mydramalist.com/0R73O_4c.jpg?v=1,2007,2007.0,Drama,ysabella,34940.0,
1671,zyuden sentai kyoryuger,Drama,Japan,48.0,2014,TV Asahi,24 min.,931,"Sakamoto Koichi, Shibasaki Takayuki, Watanab...",Sanjo Riku,...,Tokusatsu,"Superhero,Father-Son Relationship,Slight Roma...","Long ago, in the time of the dinosaurs, the Ea...",https://i.mydramalist.com/p63vE_4c.jpg?v=1,2013,,,,,False


In [284]:
# lookin at this i should be able to find all duplicates with nan in the last columns and remove these instances! 
# so lets look at how we can remove the ones that are in the duplicate ones from the main df WITHOUT removing 
# all the columns with nan values in these columns as I still want to be able to fix some popular titles with name descrepancies



In [285]:
## trying to remove the duplicate values with nan in the sid column

# identifying duplicates
duplicate_columns = ['Name', 'category', 'country', 'num_episodes', 'aired']
duplicates = final_merged_df.duplicated(subset=duplicate_columns, keep=False)

#creating a df with all duplicates
duplicate_df = final_merged_df[duplicates]

# filter for NaN values in specific columns
nan_columns = ['year', 'kind', 'title', 'sid']
nan_duplicates = duplicate_df[nan_columns].isna().all(axis=1)

# rows to be removed are the ones which are duplicates and have NaN in the specific columns
rows_to_remove = duplicate_df[nan_duplicates].index

# remove the filtered duplicates
final_merged_df = final_merged_df.drop(index=rows_to_remove)

# resetting index
final_merged_df.reset_index(drop=True, inplace=True)


In [286]:
final_merged_df.head(30)

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
0,tunnel,Drama,South Korea,16.0,2017,OCN,60 min.,32651,"Nam Ki Hoon, Shin Yong Hwi",Lee Eun Mi,...,"Thriller,Mystery,Sci-Fi,Fantasy","Time Travel,Murder,Criminal Profiler,Serial K...","In 1986, Park Gwang Ho works as an excellent a...",https://i.mydramalist.com/JkryYc.jpg?v=1,2016,2017.0,Drama,tunnel,15673.0,True
1,twenty-five twenty-one,Drama,South Korea,16.0,2022,Netflix tvN,1 hr. 13 min.,16043,Jung Ji Hyun,Kwon Do Eun,...,"Romance,Life,Youth,Drama","Athlete,Fencing,1990s,Coming Of Age,Bold Fema...",The story is set in 1998 and tells the stories...,https://i.mydramalist.com/ROOPo_4c.jpg?v=1,2021,2022.0,Drama,twenty-five twenty-one,40541.0,True
2,dr. romantic season 2,Drama,South Korea,16.0,2020,SBS,1 hr. 10 min.,31855,"Yoo In Shik, Lee Gil Bok",Kang Eun Kyung,...,"Romance,Drama,Medical,Melodrama","Hospital,Smart Male Lead,Character Developmen...","A real doctor story set in a small, humble h...",https://i.mydramalist.com/Rr7DEc.jpg?v=1,2019,2020.0,Drama,dr. romantic season 2,26981.0,True
3,move to heaven,Drama,South Korea,10.0,2021,Netflix,52 min.,40962,Kim Sung Ho,Yoon Ji Ryun,...,"Life,Drama,Family","Autism,Uncle-Nephew Relationship,Death,Savant...",Geu Roo is a young autistic man. He works for ...,https://i.mydramalist.com/Rle36_4c.jpg?v=1,2020,2021.0,Drama,move to heaven,29419.0,True
4,the king’s avatar,Drama,China,40.0,2019,Tencent Video,45 min.,24236,"Shiyiyue, Zhang Xiao An","Qiao Bing Qing, Zhou Miao, Li Zhen",...,"Action,Friendship,Youth,Sports","Online Gaming,Strong Friendship,Smart Male Le...","In the multiplayer online game Glory, Ye Xiu i...",https://i.mydramalist.com/2O0xEc.jpg?v=1,2018,2019.0,Drama,the king’s avatar,17517.0,True
5,unnatural,Drama,Japan,10.0,2018,TBS,45 min.,6699,"Tsukahara Ayuko, Takemura Kentaro, Murao Yos...",Nogi Akiko,...,"Mystery,Medical","Forensic,Smart Female Lead,Death,Strong Femal...",Unnatural is a case-of-the-week medical myster...,https://i.mydramalist.com/jYgxb_4c.jpg?v=1,2017,2018.0,Drama,unnatural,18563.0,True
6,rebel: thief who stole the people,Drama,South Korea,30.0,2017,MBC,60 min.,14364,Kim Jin Man,Hwang Jin Young,...,"Action,Thriller,Historical,Romance","Rebellion,Joseon Dynasty,Strong Male Lead,Sma...",The culprit who is posing as a royal official...,https://i.mydramalist.com/Red2Vc.jpg?v=1,2016,2017.0,Drama,rebel: thief who stole the people,15203.0,True
7,you are my hero,Drama,China,40.0,2021,iQiyi Tencent Video Youku,45 min.,17259,Zhang Tong,Qin Wen,...,"Military,Romance,Drama,Medical","Adapted From A Novel,Male Chases Female First...",Freshly graduated doctor Mi Ka was at a jewell...,https://i.mydramalist.com/RpDez_4c.jpg?v=1,2020,2021.0,Drama,you are my hero,29595.0,True
8,love and redemption,Drama,China,59.0,2020,Mango TV Tencent Video Youku,45 min.,15966,"Yin Tao, Mai Guan Zhi",Liu Fang,...,"Historical,Romance,Wuxia,Fantasy","Reincarnation,Deity,Female Warrior,Hidden Ide...",A thousand years since the battle between the ...,https://i.mydramalist.com/BoB65_4c.jpg?v=1,2019,2020.0,Drama,love and redemption,27491.0,True
9,hospital playlist,Drama,South Korea,12.0,2020,Netflix tvN,1 hr. 30 min.,63124,Shin Won Ho,Lee Woo Jung,...,"Friendship,Romance,Life,Medical","Strong Friendship,Multiple Mains,Best Friends...",The stories of people going through their days...,https://i.mydramalist.com/RXXL6_4c.jpg?v=1,2019,2020.0,Drama,hospital playlist,26436.0,True


In [287]:
final_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             4803 non-null   object  
 1   category         4803 non-null   object  
 2   country          4803 non-null   object  
 3   num_episodes     4803 non-null   float64 
 4   aired            4803 non-null   int64   
 5   orginal_network  4466 non-null   object  
 6   duration         4658 non-null   object  
 7   watchers         4803 non-null   int64   
 8   director         3681 non-null   object  
 9   screenwriter     2869 non-null   object  
 10  rating           4803 non-null   float64 
 11  num_raters       4803 non-null   int64   
 12  cast_names       4799 non-null   object  
 13  genre_names      4775 non-null   object  
 14  tag_names        4803 non-null   object  
 15  synopsis         4678 non-null   object  
 16  url              4803 non-null   object  


In [288]:
## looks like there may be some issues with some name discrepancies. may go through some individually to attach sid 
## should i try to change the names again?? 
rows_with_nan_sid = final_merged_df[pd.isna(final_merged_df['sid'])]
rows_with_nan_sid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 812 entries, 120 to 4648
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             812 non-null    object  
 1   category         812 non-null    object  
 2   country          812 non-null    object  
 3   num_episodes     812 non-null    float64 
 4   aired            812 non-null    int64   
 5   orginal_network  752 non-null    object  
 6   duration         779 non-null    object  
 7   watchers         812 non-null    int64   
 8   director         586 non-null    object  
 9   screenwriter     473 non-null    object  
 10  rating           812 non-null    float64 
 11  num_raters       812 non-null    int64   
 12  cast_names       811 non-null    object  
 13  genre_names      807 non-null    object  
 14  tag_names        812 non-null    object  
 15  synopsis         784 non-null    object  
 16  url              812 non-null    object  

In [289]:
rows_with_nan_sid.head(15)

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
120,the penthouse: war in life,Drama,South Korea,21.0,2021,SBS,1 hr. 25 min.,33479,Joo Dong Min,Kim Soon Ok,...,"Thriller,Mystery,Drama,Family","Betrayal,Revenge,Greed,Jealousy,Female Center...","The residents of Hera Palace, a luxury penthou...",https://i.mydramalist.com/QyBA2_4c.jpg?v=1,2020,,,,,False
122,snowdrop,Drama,South Korea,16.0,2022,jTBC,1 hr. 30 min.,30949,Jo Hyun Taek,Yoo Hyun Mi,...,"Romance,Drama,Melodrama,Political","1980s,Love At First Sight,Political Intrigue,...","In 1987, a bloody man jumps into a womens uni...",https://i.mydramalist.com/RKBWE_4c.jpg?v=1,2021,,,,,False
128,someday or one day,Drama,Taiwan,13.0,2020,CTV,1 hr. 12 min.,16614,Tien Jen Huang,"Hsin Hui Lin, Chi Feng Chien",...,"Thriller,Romance,Sci-Fi","Time Travel,Doppelganger,Strong Friendship,Fi...",The year is 2019. The 27 year-old Huang Yu Xua...,https://i.mydramalist.com/1L7lyc.jpg?v=1,2019,,,,,False
136,liar game 2,Drama,Japan,9.0,2010,Fuji TV,54 min.,11040,Ohki Ayako,Kuroiwa Tsutomu,...,"Thriller,Psychological,Drama","Moral Dilemma,Betrayal,Debt,Smart Male Lead,G...",The naive Kanzaki Nao and expert swindler Akiy...,https://i.mydramalist.com/0wXZ4c.jpg?v=1,2009,,,,,False
150,pinocchio,Drama,South Korea,20.0,2015,SBS,58 min.,107068,Jo Soo Won,Park Hye Ryun,...,"Thriller,Mystery,Comedy,Romance","News Reporter,Tragic Past,Hidden Identity,Dou...",The idealistic Choi In Ha has her work cut out...,https://i.mydramalist.com/1zDn5_4c.jpg?v=1,2014,,,,,False
154,until we meet again,Drama,Thailand,17.0,2020,LINE TV,45 min.,37295,New Siwaj Sawatmaneekul,,...,"Food,Romance,Youth,Drama","Fated Love,Reincarnated Lovers,Soulmates,Adap...","Thirty years ago, Korn and Intouch were univer...",https://i.mydramalist.com/WBDBPc.jpg?v=1,2019,,,,,False
177,psychopath diary,Drama,South Korea,16.0,2020,tvN,1 hr. 20 min.,20919,Lee Jong Jae,"Ryu Yong Jae, Kim Hwan Chae, Choi Sung Joon",...,"Thriller,Mystery,Comedy,Crime","Serial Killer,Black Comedy,Psychopath,Amnesia...",Yook Dong Shik is a 34-year-old office worker ...,https://i.mydramalist.com/2g4yWc.jpg?v=1,2019,,,,,False
193,code blue 3,Drama,Japan,10.0,2017,Fuji TV,54 min.,3099,"Nishiura Masaki, Tanaka Ryo",Adachi Naoko,...,"Drama,Medical","Hospital,Surgeon,Multiple Mains,Hardworking M...","Kousaku Aizawa, Megumi Shiraishi, Mihoko Hiyam...",https://i.mydramalist.com/gWByoc.jpg?v=1,2016,,,,,False
199,true beauty,Drama,South Korea,16.0,2021,tvN,1 hr. 15 min.,74556,Kim Sang Hyub,Lee Shi Eun,...,"Comedy,Romance,Youth,Drama","Adapted From A Webtoon,Popular Male Lead,Make...",True Beauty is a romantic comedy about a high ...,https://i.mydramalist.com/qP2kK_4c.jpg?v=1,2020,,,,,False
203,kyou kara ore wa!!,Drama,Japan,10.0,2018,NTV,46 min.,3845,Fukuda Yuichi,,...,"Action,Comedy,School,Youth","High School,Student,Adapted From A Manga,Frie...","Two transfer students, Mitsuhashi Takashi and ...",https://i.mydramalist.com/x5n1w_4c.jpg?v=1,2017,,,,,False


In [290]:
## trying to merge again. i can see instances where there really shouldnt be a problem merging... so why isnt it finding
# some drama names! 

final_merged_df['merged'] = final_merged_df['title'].notna()
final_merged_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             4803 non-null   object  
 1   category         4803 non-null   object  
 2   country          4803 non-null   object  
 3   num_episodes     4803 non-null   float64 
 4   aired            4803 non-null   int64   
 5   orginal_network  4466 non-null   object  
 6   duration         4658 non-null   object  
 7   watchers         4803 non-null   int64   
 8   director         3681 non-null   object  
 9   screenwriter     2869 non-null   object  
 10  rating           4803 non-null   float64 
 11  num_raters       4803 non-null   int64   
 12  cast_names       4799 non-null   object  
 13  genre_names      4775 non-null   object  
 14  tag_names        4803 non-null   object  
 15  synopsis         4678 non-null   object  
 16  url              4803 non-null   object  


In [295]:
test = final_merged_df[~final_merged_df['merged']]

In [298]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 812 entries, 120 to 4648
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             812 non-null    object 
 1   category         812 non-null    object 
 2   country          812 non-null    object 
 3   num_episodes     812 non-null    float64
 4   aired            812 non-null    int64  
 5   orginal_network  752 non-null    object 
 6   duration         779 non-null    object 
 7   watchers         812 non-null    int64  
 8   director         586 non-null    object 
 9   screenwriter     473 non-null    object 
 10  rating           812 non-null    float64
 11  num_raters       812 non-null    int64  
 12  cast_names       811 non-null    object 
 13  genre_names      807 non-null    object 
 14  tag_names        812 non-null    object 
 15  synopsis         784 non-null    object 
 16  url              812 non-null    object 
 17  adjusted_year

In [297]:
# lets drop the nan value columns 
columns_to_drop = ['title','year', 'kind', 'sid','merged']
test.drop(columns=columns_to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns=columns_to_drop, axis=1, inplace=True)


In [299]:
# again match with un merged columns
adjusted_match = pd.merge(test, shows, left_on=['Name', 'country', 'category', 'adjusted_year'], right_on=['title', 'country', 'kind','year'], how='left')

adjusted_match.head(30)


Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,cast_names,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid
0,the penthouse: war in life,Drama,South Korea,21.0,2021,SBS,1 hr. 25 min.,33479,Joo Dong Min,Kim Soon Ok,...,"Lee Ji Ah,Kim So Yeon,Eugene,Uhm Ki Joon,Bong ...","Thriller,Mystery,Drama,Family","Betrayal,Revenge,Greed,Jealousy,Female Center...","The residents of Hera Palace, a luxury penthou...",https://i.mydramalist.com/QyBA2_4c.jpg?v=1,2020,2020.0,Drama,the penthouse: war in life,30791.0
1,snowdrop,Drama,South Korea,16.0,2022,jTBC,1 hr. 30 min.,30949,Jo Hyun Taek,Yoo Hyun Mi,...,"Jung Hae In,Kim Ji Soo,Jang Seung Jo,Yoo In Na...","Romance,Drama,Melodrama,Political","1980s,Love At First Sight,Political Intrigue,...","In 1987, a bloody man jumps into a womens uni...",https://i.mydramalist.com/RKBWE_4c.jpg?v=1,2021,2021.0,Drama,snowdrop,33603.0
2,someday or one day,Drama,Taiwan,13.0,2020,CTV,1 hr. 12 min.,16614,Tien Jen Huang,"Hsin Hui Lin, Chi Feng Chien",...,"Ko Chia Yen,Greg Hsu,Patrick Shih,Yan Yu Lin,Z...","Thriller,Romance,Sci-Fi","Time Travel,Doppelganger,Strong Friendship,Fi...",The year is 2019. The 27 year-old Huang Yu Xua...,https://i.mydramalist.com/1L7lyc.jpg?v=1,2019,2019.0,Drama,someday or one day,27511.0
3,liar game 2,Drama,Japan,9.0,2010,Fuji TV,54 min.,11040,Ohki Ayako,Kuroiwa Tsutomu,...,"Toda Erika,Kikuchi Rinko,Matsuda Shota,Takei E...","Thriller,Psychological,Drama","Moral Dilemma,Betrayal,Debt,Smart Male Lead,G...",The naive Kanzaki Nao and expert swindler Akiy...,https://i.mydramalist.com/0wXZ4c.jpg?v=1,2009,2009.0,Drama,liar game 2,40.0
4,pinocchio,Drama,South Korea,20.0,2015,SBS,58 min.,107068,Jo Soo Won,Park Hye Ryun,...,"Lee Jong Suk,Park Shin Hye,Kim Young Kwang,Lee...","Thriller,Mystery,Comedy,Romance","News Reporter,Tragic Past,Hidden Identity,Dou...",The idealistic Choi In Ha has her work cut out...,https://i.mydramalist.com/1zDn5_4c.jpg?v=1,2014,2014.0,Drama,pinocchio,9978.0
5,until we meet again,Drama,Thailand,17.0,2020,LINE TV,45 min.,37295,New Siwaj Sawatmaneekul,,...,"Fluke Natouch Siripongthon,Ohm Thitiwat Ritpra...","Food,Romance,Youth,Drama","Fated Love,Reincarnated Lovers,Soulmates,Adap...","Thirty years ago, Korn and Intouch were univer...",https://i.mydramalist.com/WBDBPc.jpg?v=1,2019,2019.0,Drama,until we meet again,25195.0
6,psychopath diary,Drama,South Korea,16.0,2020,tvN,1 hr. 20 min.,20919,Lee Jong Jae,"Ryu Yong Jae, Kim Hwan Chae, Choi Sung Joon",...,"Yoon Shi Yoon,Jung In Sun,Park Sung Hoon,Lee ...","Thriller,Mystery,Comedy,Crime","Serial Killer,Black Comedy,Psychopath,Amnesia...",Yook Dong Shik is a 34-year-old office worker ...,https://i.mydramalist.com/2g4yWc.jpg?v=1,2019,2019.0,Drama,psychopath diary,26706.0
7,code blue 3,Drama,Japan,10.0,2017,Fuji TV,54 min.,3099,"Nishiura Masaki, Tanaka Ryo",Adachi Naoko,...,"Yamashita Tomohisa,Aragaki Yui,Toda Erika,Higa...","Drama,Medical","Hospital,Surgeon,Multiple Mains,Hardworking M...","Kousaku Aizawa, Megumi Shiraishi, Mihoko Hiyam...",https://i.mydramalist.com/gWByoc.jpg?v=1,2016,,,,
8,true beauty,Drama,South Korea,16.0,2021,tvN,1 hr. 15 min.,74556,Kim Sang Hyub,Lee Shi Eun,...,"Moon Ga Young,Cha Eun Woo,Hwang In Yeop,Park Y...","Comedy,Romance,Youth,Drama","Adapted From A Webtoon,Popular Male Lead,Make...",True Beauty is a romantic comedy about a high ...,https://i.mydramalist.com/qP2kK_4c.jpg?v=1,2020,2020.0,Drama,true beauty,27100.0
9,kyou kara ore wa!!,Drama,Japan,10.0,2018,NTV,46 min.,3845,Fukuda Yuichi,,...,"Kaku Kento,Ito Kentaro,Seino Nana,Hashimoto Ka...","Action,Comedy,School,Youth","High School,Student,Adapted From A Manga,Frie...","Two transfer students, Mitsuhashi Takashi and ...",https://i.mydramalist.com/x5n1w_4c.jpg?v=1,2017,,,,


In [300]:
adjusted_match.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 812 entries, 0 to 811
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             812 non-null    object  
 1   category         812 non-null    object  
 2   country          812 non-null    object  
 3   num_episodes     812 non-null    float64 
 4   aired            812 non-null    int64   
 5   orginal_network  752 non-null    object  
 6   duration         779 non-null    object  
 7   watchers         812 non-null    int64   
 8   director         586 non-null    object  
 9   screenwriter     473 non-null    object  
 10  rating           812 non-null    float64 
 11  num_raters       812 non-null    int64   
 12  cast_names       811 non-null    object  
 13  genre_names      807 non-null    object  
 14  tag_names        812 non-null    object  
 15  synopsis         784 non-null    object  
 16  url              812 non-null    object  
 1

In [301]:
# taking only the ones with sid data 
adjusted_match = adjusted_match[adjusted_match['title'].notna() & adjusted_match['sid'].notna()]

In [302]:
adjusted_match.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456 entries, 0 to 811
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             456 non-null    object  
 1   category         456 non-null    object  
 2   country          456 non-null    object  
 3   num_episodes     456 non-null    float64 
 4   aired            456 non-null    int64   
 5   orginal_network  426 non-null    object  
 6   duration         438 non-null    object  
 7   watchers         456 non-null    int64   
 8   director         352 non-null    object  
 9   screenwriter     291 non-null    object  
 10  rating           456 non-null    float64 
 11  num_raters       456 non-null    int64   
 12  cast_names       455 non-null    object  
 13  genre_names      453 non-null    object  
 14  tag_names        456 non-null    object  
 15  synopsis         448 non-null    object  
 16  url              456 non-null    object  
 1

In [303]:
# merging with final df again

second_final_df = pd.concat([final_merged_df, adjusted_match])

In [304]:
second_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5259 entries, 0 to 811
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             5259 non-null   object  
 1   category         5259 non-null   object  
 2   country          5259 non-null   object  
 3   num_episodes     5259 non-null   float64 
 4   aired            5259 non-null   int64   
 5   orginal_network  4892 non-null   object  
 6   duration         5096 non-null   object  
 7   watchers         5259 non-null   int64   
 8   director         4033 non-null   object  
 9   screenwriter     3160 non-null   object  
 10  rating           5259 non-null   float64 
 11  num_raters       5259 non-null   int64   
 12  cast_names       5254 non-null   object  
 13  genre_names      5228 non-null   object  
 14  tag_names        5259 non-null   object  
 15  synopsis         5126 non-null   object  
 16  url              5259 non-null   object  
 

In [305]:
# identify duplicates
duplicate_columns = ['Name', 'category', 'country', 'num_episodes', 'aired']
duplicates = second_final_df.duplicated(subset=duplicate_columns, keep=False)

# create df with all duplicates
duplicate_df = second_final_df[duplicates]

# filter for NaN values in specific columns
nan_columns = ['year', 'kind', 'title', 'sid']
nan_duplicates = duplicate_df[nan_columns].isna().all(axis=1)

# rows to be removed are the ones which are duplicates and have NaN in the specific columns
rows_to_remove = duplicate_df[nan_duplicates].index

# remove the filtered duplicates
second_final_df = second_final_df.drop(index=rows_to_remove)

# resetting index
second_final_df.reset_index(drop=True, inplace=True)

In [307]:
second_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4768 entries, 0 to 4767
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             4768 non-null   object  
 1   category         4768 non-null   object  
 2   country          4768 non-null   object  
 3   num_episodes     4768 non-null   float64 
 4   aired            4768 non-null   int64   
 5   orginal_network  4435 non-null   object  
 6   duration         4624 non-null   object  
 7   watchers         4768 non-null   int64   
 8   director         3657 non-null   object  
 9   screenwriter     2848 non-null   object  
 10  rating           4768 non-null   float64 
 11  num_raters       4768 non-null   int64   
 12  cast_names       4764 non-null   object  
 13  genre_names      4740 non-null   object  
 14  tag_names        4768 non-null   object  
 15  synopsis         4643 non-null   object  
 16  url              4768 non-null   object  


In [308]:
second_final_df['merged'] = second_final_df['title'].notna()
second_final_df.head(30)

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
0,tunnel,Drama,South Korea,16.0,2017,OCN,60 min.,32651,"Nam Ki Hoon, Shin Yong Hwi",Lee Eun Mi,...,"Thriller,Mystery,Sci-Fi,Fantasy","Time Travel,Murder,Criminal Profiler,Serial K...","In 1986, Park Gwang Ho works as an excellent a...",https://i.mydramalist.com/JkryYc.jpg?v=1,2016,2017.0,Drama,tunnel,15673.0,True
1,twenty-five twenty-one,Drama,South Korea,16.0,2022,Netflix tvN,1 hr. 13 min.,16043,Jung Ji Hyun,Kwon Do Eun,...,"Romance,Life,Youth,Drama","Athlete,Fencing,1990s,Coming Of Age,Bold Fema...",The story is set in 1998 and tells the stories...,https://i.mydramalist.com/ROOPo_4c.jpg?v=1,2021,2022.0,Drama,twenty-five twenty-one,40541.0,True
2,dr. romantic season 2,Drama,South Korea,16.0,2020,SBS,1 hr. 10 min.,31855,"Yoo In Shik, Lee Gil Bok",Kang Eun Kyung,...,"Romance,Drama,Medical,Melodrama","Hospital,Smart Male Lead,Character Developmen...","A real doctor story set in a small, humble h...",https://i.mydramalist.com/Rr7DEc.jpg?v=1,2019,2020.0,Drama,dr. romantic season 2,26981.0,True
3,move to heaven,Drama,South Korea,10.0,2021,Netflix,52 min.,40962,Kim Sung Ho,Yoon Ji Ryun,...,"Life,Drama,Family","Autism,Uncle-Nephew Relationship,Death,Savant...",Geu Roo is a young autistic man. He works for ...,https://i.mydramalist.com/Rle36_4c.jpg?v=1,2020,2021.0,Drama,move to heaven,29419.0,True
4,the king’s avatar,Drama,China,40.0,2019,Tencent Video,45 min.,24236,"Shiyiyue, Zhang Xiao An","Qiao Bing Qing, Zhou Miao, Li Zhen",...,"Action,Friendship,Youth,Sports","Online Gaming,Strong Friendship,Smart Male Le...","In the multiplayer online game Glory, Ye Xiu i...",https://i.mydramalist.com/2O0xEc.jpg?v=1,2018,2019.0,Drama,the king’s avatar,17517.0,True
5,unnatural,Drama,Japan,10.0,2018,TBS,45 min.,6699,"Tsukahara Ayuko, Takemura Kentaro, Murao Yos...",Nogi Akiko,...,"Mystery,Medical","Forensic,Smart Female Lead,Death,Strong Femal...",Unnatural is a case-of-the-week medical myster...,https://i.mydramalist.com/jYgxb_4c.jpg?v=1,2017,2018.0,Drama,unnatural,18563.0,True
6,rebel: thief who stole the people,Drama,South Korea,30.0,2017,MBC,60 min.,14364,Kim Jin Man,Hwang Jin Young,...,"Action,Thriller,Historical,Romance","Rebellion,Joseon Dynasty,Strong Male Lead,Sma...",The culprit who is posing as a royal official...,https://i.mydramalist.com/Red2Vc.jpg?v=1,2016,2017.0,Drama,rebel: thief who stole the people,15203.0,True
7,you are my hero,Drama,China,40.0,2021,iQiyi Tencent Video Youku,45 min.,17259,Zhang Tong,Qin Wen,...,"Military,Romance,Drama,Medical","Adapted From A Novel,Male Chases Female First...",Freshly graduated doctor Mi Ka was at a jewell...,https://i.mydramalist.com/RpDez_4c.jpg?v=1,2020,2021.0,Drama,you are my hero,29595.0,True
8,love and redemption,Drama,China,59.0,2020,Mango TV Tencent Video Youku,45 min.,15966,"Yin Tao, Mai Guan Zhi",Liu Fang,...,"Historical,Romance,Wuxia,Fantasy","Reincarnation,Deity,Female Warrior,Hidden Ide...",A thousand years since the battle between the ...,https://i.mydramalist.com/BoB65_4c.jpg?v=1,2019,2020.0,Drama,love and redemption,27491.0,True
9,hospital playlist,Drama,South Korea,12.0,2020,Netflix tvN,1 hr. 30 min.,63124,Shin Won Ho,Lee Woo Jung,...,"Friendship,Romance,Life,Medical","Strong Friendship,Multiple Mains,Best Friends...",The stories of people going through their days...,https://i.mydramalist.com/RXXL6_4c.jpg?v=1,2019,2020.0,Drama,hospital playlist,26436.0,True


In [311]:
test = second_final_df[~second_final_df['merged']]

In [313]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356 entries, 186 to 4188
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             356 non-null    object  
 1   category         356 non-null    object  
 2   country          356 non-null    object  
 3   num_episodes     356 non-null    float64 
 4   aired            356 non-null    int64   
 5   orginal_network  326 non-null    object  
 6   duration         341 non-null    object  
 7   watchers         356 non-null    int64   
 8   director         234 non-null    object  
 9   screenwriter     182 non-null    object  
 10  rating           356 non-null    float64 
 11  num_raters       356 non-null    int64   
 12  cast_names       356 non-null    object  
 13  genre_names      354 non-null    object  
 14  tag_names        356 non-null    object  
 15  synopsis         336 non-null    object  
 16  url              356 non-null    object  

In [314]:
# lets drop the nan value columns 
columns_to_drop = ['title','year', 'kind', 'sid','merged']
test.drop(columns=columns_to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns=columns_to_drop, axis=1, inplace=True)


In [317]:
# will try again with the merge for the non adjusted year
adjusted_match = pd.merge(test, shows, left_on=['Name', 'country', 'category', 'aired'], right_on=['title', 'country', 'kind','year'], how='left')

adjusted_match


Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,cast_names,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid
0,code blue 3,Drama,Japan,10.0,2017,Fuji TV,54 min.,3099,"Nishiura Masaki, Tanaka Ryo",Adachi Naoko,...,"Yamashita Tomohisa,Aragaki Yui,Toda Erika,Higa...","Drama,Medical","Hospital,Surgeon,Multiple Mains,Hardworking M...","Kousaku Aizawa, Megumi Shiraishi, Mihoko Hiyam...",https://i.mydramalist.com/gWByoc.jpg?v=1,2016,,,,
1,kyou kara ore wa!!,Drama,Japan,10.0,2018,NTV,46 min.,3845,Fukuda Yuichi,,...,"Kaku Kento,Ito Kentaro,Seino Nana,Hashimoto Ka...","Action,Comedy,School,Youth","High School,Student,Adapted From A Manga,Frie...","Two transfer students, Mitsuhashi Takashi and ...",https://i.mydramalist.com/x5n1w_4c.jpg?v=1,2017,,,,
2,age of youth 2,Drama,South Korea,14.0,2017,jTBC,60 min.,28720,"Lee Tae Gon, Kim Sang Ho",Park Yeon Sun,...,"Han Ye Ri,Han Seung Yeon,Park Eun Bin,Ji Woo,C...","Mystery,Comedy,Romance,Youth","Female Centered Plot,Multiple Mains,Housemate...",Five girls in their 20s share a house called B...,https://i.mydramalist.com/gWz4Wc.jpg?v=1,2016,,,,
3,voice 2,Drama,South Korea,12.0,2018,OCN,60 min.,10538,Lee Seung Young,Ma Jin Won,...,"Lee Ha Na ,Lee Jin Wook,Kwon Yool,Ahn Se Ha,S...","Thriller,Mystery,Psychological","Strong Female Lead,Mad Dog,Leadership,Psychop...","Listen closely, because skilled voice profiler...",https://i.mydramalist.com/xEp0yc.jpg?v=1,2017,,,,
4,naoki 2,Drama,Japan,10.0,2020,TBS,54 min.,1001,"Fukuzawa Katsuo, Tanaka Kenta",Ushio Kentaro,...,"Sakai Masato,Kagawa Teruyuki ,Oikawa Mitsuhiro...","Thriller,Business,Drama","Bank,Adapted From A Novel,Corruption,Bankrupt...","Unconventional banker Hanzawa Naoki, has been ...",https://i.mydramalist.com/kd2ZO_4c.jpg?v=1,2019,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,club friday the series season 8: true loveor ...,Drama,Thailand,4.0,2017,GMM 25,50 min.,131,,,...,"Baifern Pimchanok Luevisadpaibul,Two Popetorn ...","Romance,Life,Drama","Blindness,Warm Female Lead,Nice Female Lead,D...","Via social media, a radio DJ begins a friendsh...",https://i.mydramalist.com/wmk6Yc.jpg?v=1,2016,,,,
352,dr. parks clinic,Drama,South Korea,12.0,2022,,28 min.,851,Seo Joon Bum,,...,"Lee Seo Jin,Ra Mi Ran,Cha Chung Hwa,Shin Eun J...","Comedy,Medical","Doctor Male Lead,Married Couple,Web Series,Ad...",Dr. Park Won Jang recently opened his own inte...,https://i.mydramalist.com/xLXvy_4c.jpg?v=1,2021,,,,
353,wu xin: the monster killer 3,Drama,China,28.0,2020,iQiyi Sohu TV,40 min.,801,"Patrick Yau, Wai Hong Chui","Li Nan, Xu Zi Yuan",...,"Elvis Han,Sebrina Chen,Ding Qiao,Sui Yong Lian...","Historical,Romance,Fantasy","Demon/Monster Hunting,Supernatural,Adventure,...",An immortal Taoist Master with no heart seeks ...,https://i.mydramalist.com/qoL2Pc.jpg?v=1,2019,,,,
354,above the clouds: season 2,Drama,China,25.0,2017,iQiyi,45 min.,224,"Chan Ka Lam, Li Wen Long, Deng Wei En",Yu Zheng,...,"Mabel Yuan,Chen Xiao,Gao Yu Er,Lin Yi Ting,Nan...","Action,Historical,Romance,Life,Melodrama",,,https://i.mydramalist.com/BpARRc.jpg?v=1,2016,,,,


In [318]:
adjusted_match.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356 entries, 0 to 355
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             356 non-null    object  
 1   category         356 non-null    object  
 2   country          356 non-null    object  
 3   num_episodes     356 non-null    float64 
 4   aired            356 non-null    int64   
 5   orginal_network  326 non-null    object  
 6   duration         341 non-null    object  
 7   watchers         356 non-null    int64   
 8   director         234 non-null    object  
 9   screenwriter     182 non-null    object  
 10  rating           356 non-null    float64 
 11  num_raters       356 non-null    int64   
 12  cast_names       356 non-null    object  
 13  genre_names      354 non-null    object  
 14  tag_names        356 non-null    object  
 15  synopsis         336 non-null    object  
 16  url              356 non-null    object  
 1

In [319]:
# taking only the ones with sid data 
adjusted_match = adjusted_match[adjusted_match['title'].notna() & adjusted_match['sid'].notna()]

In [320]:
adjusted_match.head(50)

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,cast_names,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid


In [321]:
adjusted_match.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             0 non-null      object  
 1   category         0 non-null      object  
 2   country          0 non-null      object  
 3   num_episodes     0 non-null      float64 
 4   aired            0 non-null      int64   
 5   orginal_network  0 non-null      object  
 6   duration         0 non-null      object  
 7   watchers         0 non-null      int64   
 8   director         0 non-null      object  
 9   screenwriter     0 non-null      object  
 10  rating           0 non-null      float64 
 11  num_raters       0 non-null      int64   
 12  cast_names       0 non-null      object  
 13  genre_names      0 non-null      object  
 14  tag_names        0 non-null      object  
 15  synopsis         0 non-null      object  
 16  url              0 non-null      object  
 17  adjusted_

In [230]:
# no more to add :( 
#third_final_df = pd.concat([final_merged_df, adjusted_match])

In [322]:
second_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4768 entries, 0 to 4767
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             4768 non-null   object  
 1   category         4768 non-null   object  
 2   country          4768 non-null   object  
 3   num_episodes     4768 non-null   float64 
 4   aired            4768 non-null   int64   
 5   orginal_network  4435 non-null   object  
 6   duration         4624 non-null   object  
 7   watchers         4768 non-null   int64   
 8   director         3657 non-null   object  
 9   screenwriter     2848 non-null   object  
 10  rating           4768 non-null   float64 
 11  num_raters       4768 non-null   int64   
 12  cast_names       4764 non-null   object  
 13  genre_names      4740 non-null   object  
 14  tag_names        4768 non-null   object  
 15  synopsis         4643 non-null   object  
 16  url              4768 non-null   object  


In [325]:
## lets check duplicates : 
duplicate_columns = ['Name', 'category', 'country', 'num_episodes', 'aired', 'sid']
duplicates = second_final_df.duplicated(subset=duplicate_columns, keep=False)

# Create a DataFrame with all duplicates
duplicate_df = second_final_df[duplicates]

In [326]:
duplicate_df

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged


In [329]:
## will need to check if there are dramas with the same data that have different sids

# Group by the specified columns and count unique 'sid's
grouped = second_final_df.groupby(['Name', 'category', 'country', 'num_episodes', 'aired'])['sid'].nunique()

# Reset index to make 'Name', 'category', 'country', 'num_episodes', 'aired' columns again
grouped = grouped.reset_index(name='unique_sid_count')

# Filter rows where the count of unique 'sid's is more than 1
more_than_one_sid = grouped[grouped['unique_sid_count'] > 1]

# Sort the results by 'unique_sid_count' in descending order
sorted_more_than_one_sid = more_than_one_sid.sort_values(by='unique_sid_count', ascending=False)

# Display the sorted DataFrame
sorted_more_than_one_sid.head(50)


Unnamed: 0,Name,category,country,num_episodes,aired,unique_sid_count
956,fall in love,Drama,China,36.0,2021,2
963,falling in love,Drama,China,12.0,2021,2
2785,oh my boss,Drama,Thailand,14.0,2021,2
2830,once again,Drama,South Korea,100.0,2020,2
3083,puzzle,Drama,Japan,10.0,2008,2
4725,youth,Drama,China,26.0,2018,2


In [331]:
second_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4768 entries, 0 to 4767
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             4768 non-null   object  
 1   category         4768 non-null   object  
 2   country          4768 non-null   object  
 3   num_episodes     4768 non-null   float64 
 4   aired            4768 non-null   int64   
 5   orginal_network  4435 non-null   object  
 6   duration         4624 non-null   object  
 7   watchers         4768 non-null   int64   
 8   director         3657 non-null   object  
 9   screenwriter     2848 non-null   object  
 10  rating           4768 non-null   float64 
 11  num_raters       4768 non-null   int64   
 12  cast_names       4764 non-null   object  
 13  genre_names      4740 non-null   object  
 14  tag_names        4768 non-null   object  
 15  synopsis         4643 non-null   object  
 16  url              4768 non-null   object  


In [333]:
# lets remove all the nan values and save this csv. 
save_this_df = second_final_df[second_final_df['sid'].notna()]

In [334]:
save_this_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4412 entries, 0 to 4767
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             4412 non-null   object  
 1   category         4412 non-null   object  
 2   country          4412 non-null   object  
 3   num_episodes     4412 non-null   float64 
 4   aired            4412 non-null   int64   
 5   orginal_network  4109 non-null   object  
 6   duration         4283 non-null   object  
 7   watchers         4412 non-null   int64   
 8   director         3423 non-null   object  
 9   screenwriter     2666 non-null   object  
 10  rating           4412 non-null   float64 
 11  num_raters       4412 non-null   int64   
 12  cast_names       4408 non-null   object  
 13  genre_names      4386 non-null   object  
 14  tag_names        4412 non-null   object  
 15  synopsis         4307 non-null   object  
 16  url              4412 non-null   object  


In [335]:
# save_this_df.to_csv('dfwithsid-30-11.csv', index=False)

In [336]:
reloaddf = pd.read_csv('dfwithsid-30-11.csv')

In [337]:
reloaddf.head()

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
0,tunnel,Drama,South Korea,16.0,2017,OCN,60 min.,32651,"Nam Ki Hoon, Shin Yong Hwi",Lee Eun Mi,...,"Thriller,Mystery,Sci-Fi,Fantasy","Time Travel,Murder,Criminal Profiler,Serial K...","In 1986, Park Gwang Ho works as an excellent a...",https://i.mydramalist.com/JkryYc.jpg?v=1,2016,2017.0,Drama,tunnel,15673.0,True
1,twenty-five twenty-one,Drama,South Korea,16.0,2022,Netflix tvN,1 hr. 13 min.,16043,Jung Ji Hyun,Kwon Do Eun,...,"Romance,Life,Youth,Drama","Athlete,Fencing,1990s,Coming Of Age,Bold Fema...",The story is set in 1998 and tells the stories...,https://i.mydramalist.com/ROOPo_4c.jpg?v=1,2021,2022.0,Drama,twenty-five twenty-one,40541.0,True
2,dr. romantic season 2,Drama,South Korea,16.0,2020,SBS,1 hr. 10 min.,31855,"Yoo In Shik, Lee Gil Bok",Kang Eun Kyung,...,"Romance,Drama,Medical,Melodrama","Hospital,Smart Male Lead,Character Developmen...","A real doctor story set in a small, humble h...",https://i.mydramalist.com/Rr7DEc.jpg?v=1,2019,2020.0,Drama,dr. romantic season 2,26981.0,True
3,move to heaven,Drama,South Korea,10.0,2021,Netflix,52 min.,40962,Kim Sung Ho,Yoon Ji Ryun,...,"Life,Drama,Family","Autism,Uncle-Nephew Relationship,Death,Savant...",Geu Roo is a young autistic man. He works for ...,https://i.mydramalist.com/Rle36_4c.jpg?v=1,2020,2021.0,Drama,move to heaven,29419.0,True
4,the king’s avatar,Drama,China,40.0,2019,Tencent Video,45 min.,24236,"Shiyiyue, Zhang Xiao An","Qiao Bing Qing, Zhou Miao, Li Zhen",...,"Action,Friendship,Youth,Sports","Online Gaming,Strong Friendship,Smart Male Le...","In the multiplayer online game Glory, Ye Xiu i...",https://i.mydramalist.com/2O0xEc.jpg?v=1,2018,2019.0,Drama,the king’s avatar,17517.0,True


In [338]:
reloaddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4412 entries, 0 to 4411
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             4412 non-null   object 
 1   category         4412 non-null   object 
 2   country          4412 non-null   object 
 3   num_episodes     4412 non-null   float64
 4   aired            4412 non-null   int64  
 5   orginal_network  4109 non-null   object 
 6   duration         4283 non-null   object 
 7   watchers         4412 non-null   int64  
 8   director         3423 non-null   object 
 9   screenwriter     2666 non-null   object 
 10  rating           4412 non-null   float64
 11  num_raters       4412 non-null   int64  
 12  cast_names       4408 non-null   object 
 13  genre_names      4386 non-null   object 
 14  tag_names        4412 non-null   object 
 15  synopsis         4307 non-null   object 
 16  url              4412 non-null   object 
 17  adjusted_year 

In [232]:
# Step 1: Identify duplicates
duplicate_columns = ['Name', 'category', 'country', 'num_episodes', 'aired']
duplicates = third_final_df.duplicated(subset=duplicate_columns, keep=False)

# Create a DataFrame with all duplicates
duplicate_df = third_final_df[duplicates]

# Step 2: Filter for NaN values in specific columns
nan_columns = ['year', 'kind', 'title', 'sid']
nan_duplicates = duplicate_df[nan_columns].isna().all(axis=1)

# Rows to be removed are the ones which are duplicates and have NaN in the specific columns
rows_to_remove = duplicate_df[nan_duplicates].index

# Step 3: Remove the filtered duplicates
third_final_df = third_final_df.drop(index=rows_to_remove)

# Reset index if necessary
third_final_df.reset_index(drop=True, inplace=True)

In [233]:
third_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5371 entries, 0 to 5370
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Name             5371 non-null   object  
 1   category         5371 non-null   object  
 2   country          5371 non-null   object  
 3   num_episodes     5371 non-null   float64 
 4   aired            5371 non-null   int64   
 5   orginal_network  4998 non-null   object  
 6   duration         5209 non-null   object  
 7   watchers         5371 non-null   int64   
 8   director         4107 non-null   object  
 9   screenwriter     3198 non-null   object  
 10  rating           5371 non-null   float64 
 11  num_raters       5371 non-null   int64   
 12  cast_names       5367 non-null   object  
 13  genre_names      5341 non-null   object  
 14  tag_names        5371 non-null   object  
 15  synopsis         5227 non-null   object  
 16  url              5371 non-null   object  


In [238]:
# lets look at the remaining nan values: 
third_final_df['merged'] = third_final_df['title'].notna()
non_match_df = third_final_df[third_final_df['merged']== False]
non_match_df

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
120,the penthouse: war in life,Drama,South Korea,21.0,2021,SBS,1 hr. 25 min.,33479,Joo Dong Min,Kim Soon Ok,...,"Thriller,Mystery,Drama,Family","Betrayal,Revenge,Greed,Jealousy,Female Center...","The residents of Hera Palace, a luxury penthou...",https://i.mydramalist.com/QyBA2_4c.jpg?v=1,2020,,,,,False
122,snowdrop,Drama,South Korea,16.0,2022,jTBC,1 hr. 30 min.,30949,Jo Hyun Taek,Yoo Hyun Mi,...,"Romance,Drama,Melodrama,Political","1980s,Love At First Sight,Political Intrigue,...","In 1987, a bloody man jumps into a womens uni...",https://i.mydramalist.com/RKBWE_4c.jpg?v=1,2021,,,,,False
128,someday or one day,Drama,Taiwan,13.0,2020,CTV,1 hr. 12 min.,16614,Tien Jen Huang,"Hsin Hui Lin, Chi Feng Chien",...,"Thriller,Romance,Sci-Fi","Time Travel,Doppelganger,Strong Friendship,Fi...",The year is 2019. The 27 year-old Huang Yu Xua...,https://i.mydramalist.com/1L7lyc.jpg?v=1,2019,,,,,False
136,liar game 2,Drama,Japan,9.0,2010,Fuji TV,54 min.,11040,Ohki Ayako,Kuroiwa Tsutomu,...,"Thriller,Psychological,Drama","Moral Dilemma,Betrayal,Debt,Smart Male Lead,G...",The naive Kanzaki Nao and expert swindler Akiy...,https://i.mydramalist.com/0wXZ4c.jpg?v=1,2009,,,,,False
150,pinocchio,Drama,South Korea,20.0,2015,SBS,58 min.,107068,Jo Soo Won,Park Hye Ryun,...,"Thriller,Mystery,Comedy,Romance","News Reporter,Tragic Past,Hidden Identity,Dou...",The idealistic Choi In Ha has her work cut out...,https://i.mydramalist.com/1zDn5_4c.jpg?v=1,2014,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4638,wu xin: the monster killer 3,Drama,China,28.0,2020,iQiyi Sohu TV,40 min.,801,"Patrick Yau, Wai Hong Chui","Li Nan, Xu Zi Yuan",...,"Historical,Romance,Fantasy","Demon/Monster Hunting,Supernatural,Adventure,...",An immortal Taoist Master with no heart seeks ...,https://i.mydramalist.com/qoL2Pc.jpg?v=1,2019,,,,,False
4641,above the clouds: season 2,Drama,China,25.0,2017,iQiyi,45 min.,224,"Chan Ka Lam, Li Wen Long, Deng Wei En",Yu Zheng,...,"Action,Historical,Romance,Life,Melodrama",,,https://i.mydramalist.com/BpARRc.jpg?v=1,2016,,,,,False
4642,keishicho zero gakari: third season,Drama,Japan,7.0,2018,TV Tokyo,46 min.,90,"Takemura Kentaro, Okano Hironobu, Kuranuki K...","Tokuo Koji, Funabashi Susumu, Masaike Yosuke...",...,"Mystery,Comedy","Police Department,Policeman,Eccentric Male Le...","The ""anything"" consultation office of Suginami...",https://i.mydramalist.com/BoJp7_4c.jpg?v=1,2017,,,,,False
4644,here comes fortune star,Drama,Taiwan,60.0,2021,LINE TV SET TV Vidol,60 min.,197,Chiang Ping Chen,,...,"Adventure,Romance,Life,Fantasy","Mortal World,Unfortunate Female Lead,Deity,Re...",God of wealth Zhao Zimo was demoted to the mor...,https://i.mydramalist.com/RgJ0E_4c.jpg?v=1,2020,,,,,False


In [None]:
non_match_df

In [237]:
# lets see if there are duplicates 
duplicate_columns = ['Name', 'category', 'country', 'num_episodes', 'aired']
duplicates = third_final_df.duplicated(subset=duplicate_columns, keep=False)

# Create a DataFrame with all duplicates
duplicate_df = third_final_df[duplicates]
duplicate_df

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,...,genre_names,tag_names,synopsis,url,adjusted_year,year,kind,title,sid,merged
95,the guest,Drama,South Korea,16.0,2018,OCN,1 hr. 7 min.,24584,Kim Hong Seon,"Seo Jae Won, Kwon So Ra",...,"Thriller,Mystery,Horror,Supernatural","Exorcism,Death,Demon,Possessed,Shamanism,Murd...","Yoon Hwa Pyung, a young psychic born into a sh...",https://i.mydramalist.com/Wb2pWc.jpg?v=1,2017,2018.0,Drama,the guest,21424.0,True
97,cross fire,Drama,China,36.0,2020,Tencent Video,45 min.,7336,Derek Hui,Xu Su,...,"Action,Youth,Sci-Fi,Sports","Gaming,Time Altering,Disability,Bromance,Adap...","In 2008, Xiao Feng is a downcast e-sports team...",https://i.mydramalist.com/v6qPe_4c.jpg?v=1,2019,2020.0,Drama,cross fire,25404.0,True
109,once again,Drama,South Korea,100.0,2020,KBS2,35 min.,9287,Lee Jae Sang,Yang Hee Seung,...,"Comedy,Romance,Drama,Family","Divorce,Family Relationship,Multiple Couples,...","""Once Again"" is the story of the eventful Song...",https://i.mydramalist.com/kWPprc.jpg?v=1,2019,2020.0,Drama,once again,28594.0,True
110,once again,Drama,South Korea,100.0,2020,KBS2,35 min.,9287,Lee Jae Sang,Yang Hee Seung,...,"Comedy,Romance,Drama,Family","Divorce,Family Relationship,Multiple Couples,...","""Once Again"" is the story of the eventful Song...",https://i.mydramalist.com/kWPprc.jpg?v=1,2019,2020.0,Drama,once again,37983.0,True
123,descendants of the sun,Drama,South Korea,16.0,2016,KBS2,60 min.,133252,"Baek Sang Hoon, Lee Eung Bok","Kim Eun Sook, Kim Won Seok",...,"Action,Comedy,Romance,Melodrama","Soldier,Hardworking Female Lead,Multiple Coup...",A love story that develops between a surgeon a...,https://i.mydramalist.com/vN26Zc.jpg?v=1,2015,2016.0,Drama,descendants of the sun,9854.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5366,doctor chousahan,Drama,Japan,7.0,2016,TV Tokyo,54 min.,87,Endo Mitsutaka,Hayashi Makoto,...,"Mystery,Drama,Medical",Investigation,"It is estimated that between 1,300 to 2,000 pe...",https://i.mydramalist.com/E8717c.jpg?v=1,2015,2016.0,Drama,doctor chousahan,14171.0,True
5367,meikenchiku de chushoku wo,Drama,Japan,10.0,2020,Osaka Channel TV Tokyo,30 min.,334,Yoshimi Takuma,,...,"Food,Life,Drama,Documentary","Architecture,Adapted From A Book,Unusual Frie...","Uekusa Chiaki, an architectural modeler whose ...",https://i.mydramalist.com/BYkq5_4c.jpg?v=1,2019,2020.0,Drama,meikenchiku de chushoku wo,34166.0,True
5368,my mother is a daughter-in-law,Drama,South Korea,136.0,2015,SBS,40 min.,216,Ko Heung Sik,Lee Geun Young,...,"Comedy,Drama,Family,Melodrama",,Gyeong Sook married a wealthy older man when s...,https://i.mydramalist.com/0BkjNc.jpg?v=1,2014,2015.0,Drama,my mother is a daughter-in-law,12482.0,True
5369,woon ruk nakkao,Drama,Thailand,26.0,2020,PPTV,60 min.,155,,,...,"Comedy,Romance,Drama","Identical Twins,Hardworking Male Lead,Hardwor...",A romantic comedy about two competing news rep...,https://i.mydramalist.com/R4n4R_4c.jpg?v=1,2019,2020.0,Drama,woon ruk nakkao,24796.0,True


# adding/editing name data
### some dramas had small differences in the grammar of their names, manually updated some to assist in successful merging 

In [135]:
shows[shows['title'].str.contains('descendants')].head(5)

Unnamed: 0,country,year,kind,title,sid
53,South Korea,2016,Drama,descendants of the sun,9854
10427,Philippines,2020,Drama,descendants of the sun,30958
25394,South Korea,2016,Drama,descendants of the airport,15461
27558,China,2022,Drama,rise of the descendants,45086


In [136]:
df[df['Name'].str.contains('descendants')]

Unnamed: 0,Name,category,country,num_episodes,aired,orginal_network,duration,watchers,director,screenwriter,rating,num_raters,cast_names,genre_names,tag_names,synopsis,url,adjusted_year
149,descendants of the sun,Drama,South Korea,16.0,2016,KBS2,60 min.,133252,"Baek Sang Hoon, Lee Eung Bok","Kim Eun Sook, Kim Won Seok",8.7,69898,"Song Joong Ki,Song Hye Kyo,Jin Goo,Kim Ji Won,...","Action,Comedy,Romance,Melodrama","Soldier,Hardworking Female Lead,Multiple Coup...",A love story that develops between a surgeon a...,https://i.mydramalist.com/vN26Zc.jpg?v=1,2015


In [138]:
df.at[149,'Name']="descendants of the sun"

In [139]:
df.at[149,'Name']

'descendants of the sun'

In [101]:
shows.at[2788, 'title']

'someday or one day'

In [91]:
#df.at[137,'aired']=2016