In [26]:
import pandas as pd
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

### Relevant Data Set can be found on : https://data.world/raghav333/cricket-players-espn

## Reading Data

#### PySpark

In [27]:
spark=SparkSession.builder.appName('temp').getOrCreate()

#### Pandas

In [28]:
main=pd.read_csv("cricket_data.csv")

#### Converting Pandas Dtype Object to String Dtype in order to use the same dateframe as Spark DataFrame

In [29]:
cols_dtypes=main.dtypes.values.tolist()
columns=main.columns.values.tolist()
for index,_ in enumerate(cols_dtypes):
    if str(cols_dtypes[index])=="object":
        main[columns[index]]=main[columns[index]].astype(str)

In [30]:
main_spark=spark.createDataFrame(main.head(100))

In [31]:
main_spark.printSchema()

root
 |-- ID: long (nullable = true)
 |-- NAME: string (nullable = true)
 |-- COUNTRY: string (nullable = true)
 |-- Full name: string (nullable = true)
 |-- Born: string (nullable = true)
 |-- Died: string (nullable = true)
 |-- Current age: string (nullable = true)
 |-- Major teams: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Nickname: double (nullable = true)
 |-- Playing role: string (nullable = true)
 |-- Batting style: string (nullable = true)
 |-- Bowling style: string (nullable = true)
 |-- Other: string (nullable = true)
 |-- Relation: string (nullable = true)
 |-- In a nutshell: string (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- AWARDS: string (nullable = true)
 |-- BATTING_Tests_Mat: double (nullable = true)
 |-- BATTING_Tests_Inns: string (nullable = true)
 |-- BATTING_Tests_NO: string (nullable = true)
 |-- BATTING_Tests_Runs: string (nullable = true)
 |-- BATTING_Tests_HS: string (nulla

In [105]:
main.head()

Unnamed: 0,ID,NAME,COUNTRY,Full name,Born,Died,Current age,Major teams,Education,Height,...,BOWLING_T20s_Runs,BOWLING_T20s_Wkts,BOWLING_T20s_BBI,BOWLING_T20s_BBM,BOWLING_T20s_Ave,BOWLING_T20s_Econ,BOWLING_T20s_SR,BOWLING_T20s_4w,BOWLING_T20s_5w,BOWLING_T20s_10
0,8772,Henry Arkell,England,Henry John Denham Arkell,"\nJune 26, 1898, Edmonton, Middlesex","March 12, 1982, Oxford (aged 83 years 259 days)",,Northamptonshire,,,...,,,,,,,,,,
1,532565,Richard Nyren,England,Richard Nyren,"\nApril 25, 1734, Eartham, Sussex","April 25, 1797, Lee or Leigh, Kent (aged 63 ye...",,Hampshire XI,,,...,,,,,,,,,,
2,16856,Sydney Maartensz,England,Sydney Gratien Adair Maartensz,"\nApril 14, 1882, Colombo, Ceylon","September 10, 1967, Pyrford, Woking, Surrey (a...",,Hampshire,,,...,,,,,,,,,,
3,16715,Brian Lander,England,Brian Richard Lander,"\nJanuary 9, 1942, Bishop Auckland, Co Durham",,77 years 73 days,"['Durham,', 'Minor Counties']",,,...,,,,,,,,,,
4,15989,Derek Kenderdine,England,Derek Charles Kenderdine,"\nOctober 28, 1897, Chislehurst, Kent","August 28, 1947, Cambridge (aged 49 years 304 ...",,Royal Navy,,,...,,,,,,,,,,


In [None]:
main.columns.tolist()

['ID',
 'NAME',
 'COUNTRY',
 'Full name',
 'Born',
 'Died',
 'Current age',
 'Major teams',
 'Education',
 'Height',
 'Nickname',
 'Playing role',
 'Batting style',
 'Bowling style',
 'Other',
 'Relation',
 'In a nutshell',
 'DESCRIPTION',
 'AWARDS',
 'BATTING_Tests_Mat',
 'BATTING_Tests_Inns',
 'BATTING_Tests_NO',
 'BATTING_Tests_Runs',
 'BATTING_Tests_HS',
 'BATTING_Tests_Ave',
 'BATTING_Tests_BF',
 'BATTING_Tests_SR',
 'BATTING_Tests_100',
 'BATTING_Tests_50',
 'BATTING_Tests_4s',
 'BATTING_Tests_6s',
 'BATTING_Tests_Ct',
 'BATTING_Tests_St',
 'BATTING_ODIs_Mat',
 'BATTING_ODIs_Inns',
 'BATTING_ODIs_NO',
 'BATTING_ODIs_Runs',
 'BATTING_ODIs_HS',
 'BATTING_ODIs_Ave',
 'BATTING_ODIs_BF',
 'BATTING_ODIs_SR',
 'BATTING_ODIs_100',
 'BATTING_ODIs_50',
 'BATTING_ODIs_4s',
 'BATTING_ODIs_6s',
 'BATTING_ODIs_Ct',
 'BATTING_ODIs_St',
 'BATTING_T20Is_Mat',
 'BATTING_T20Is_Inns',
 'BATTING_T20Is_NO',
 'BATTING_T20Is_Runs',
 'BATTING_T20Is_HS',
 'BATTING_T20Is_Ave',
 'BATTING_T20Is_BF',
 'BATTIN

In [None]:
main['In a nutshell'].isnull().sum()

0

## Dropping those columns which have a high propotion of NULL Values

#### Pyspark

In [None]:
null_instances=main.isnull().sum()

In [None]:
null_instances_spark=main_spark.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in main_spark.columns])

In [None]:
null_instances_spark.show(1)

+---+----+-------+---------+----+-----+-----------+-----------+---------+------+--------+------------+-------------+-------------+-----+--------+-------------+-----------+------+-----------------+------------------+----------------+------------------+----------------+-----------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+---------------+-----------------+---------------+----------------+---------------+---------------+----------------+---------------+---------------+---------------+---------------+---------------+-----------------+------------------+----------------+------------------+----------------+-----------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+-----------------------+------------------------+----------------------+------------------------+----

In [None]:
null_instances_spark_rdd=null_instances_spark.rdd

In [None]:
null_instances_spark_rdd.collect()

[Row(ID=0, NAME=2, COUNTRY=0, Full name=2, Born=0, Died=73208, Current age=41892, Major teams=6271, Education=84010, Height=88536, Nickname=90308, Playing role=86590, Batting style=33056, Bowling style=46223, Other=83664, Relation=85608, In a nutshell=90283, DESCRIPTION=0, AWARDS=0, BATTING_Tests_Mat=86656, BATTING_Tests_Inns=86656, BATTING_Tests_NO=86656, BATTING_Tests_Runs=86656, BATTING_Tests_HS=86656, BATTING_Tests_Ave=86656, BATTING_Tests_BF=88625, BATTING_Tests_SR=88625, BATTING_Tests_100=86656, BATTING_Tests_50=86656, BATTING_Tests_4s=88299, BATTING_Tests_6s=87110, BATTING_Tests_Ct=86656, BATTING_Tests_St=86656, BATTING_ODIs_Mat=86652, BATTING_ODIs_Inns=86652, BATTING_ODIs_NO=86652, BATTING_ODIs_Runs=86652, BATTING_ODIs_HS=86652, BATTING_ODIs_Ave=86652, BATTING_ODIs_BF=87240, BATTING_ODIs_SR=87240, BATTING_ODIs_100=86652, BATTING_ODIs_50=86652, BATTING_ODIs_4s=87421, BATTING_ODIs_6s=87400, BATTING_ODIs_Ct=86652, BATTING_ODIs_St=86652, BATTING_T20Is_Mat=88157, BATTING_T20Is_Inns=

In [None]:
def col_func(x):
    list_=[]
    for col in list_columns:
        propotion_null=x[col]/count_rows
        if propotion_null<0.9:
            list_.append(col)
    return list_
        

In [None]:
list_columns=main_spark.columns
count_rows=main_spark.count()

In [None]:
count_rows

90308

In [None]:
columns_needed=null_instances_spark_rdd.map(col_func).collect()

In [None]:
null_instances=null_instances.map(lambda x:x/main.shape[0])

In [None]:
null_instances[null_instances>0.90].index

Index(['Nickname', 'BATTING_Tests_Mat', 'BATTING_ODIs_Mat',
       'BATTING_T20Is_Mat', 'BATTING_T20s_Mat', 'BOWLING_Tests_Mat',
       'BOWLING_ODIs_Mat', 'BOWLING_T20Is_Mat', 'BOWLING_T20s_Mat'],
      dtype='object')

In [None]:
main_spark=main_spark.drop('Education', 'Height', 'Nickname', 'Playing role', 'Other', 'Relation',
       'In a nutshell')

#### Pandas

In [None]:
main.drop(['Education', 'Height', 'Nickname', 'Playing role', 'Other', 'Relation',
       'In a nutshell'],1,inplace=True)

In [None]:
main["Born"]=main["Born"].str.replace("\n","")

## Forming the Date Birth Column which have Year+Date the player was born

#### Pyspark

In [32]:
split_col = split(main_spark['Born'], ',')

In [33]:
main_spark = main_spark.withColumn('NAME1', split_col.getItem(0))
main_spark = main_spark.withColumn('NAME2', split_col.getItem(1))

In [34]:
main_spark.select("NAME1","NAME2").show()

+---------------+--------+
|          NAME1|   NAME2|
+---------------+--------+
|      \nJune 26|    1898|
|     \nApril 25|    1734|
|     \nApril 14|    1882|
|    \nJanuary 9|    1942|
|   \nOctober 28|    1897|
|     \nMarch 24|    1979|
|      \nJuly 17|   1999 |
|   \nDecember 6|    1993|
|  \nFebruary 26|    1929|
|   \nJanuary 21|   2003 |
|\ndate unknown |    null|
|\ndate unknown |    null|
|     \nApril 23|    1986|
|    \nJanuary 8|    1999|
|\ndate unknown |    null|
|\ndate unknown |    null|
|     \nMarch 27|    1991|
|    \nOctober 1|    1987|
|         \n1947| Mombasa|
|        \n1975 |    null|
+---------------+--------+
only showing top 20 rows



In [35]:
main_spark=main_spark.na.fill('',["NAME2"])

In [36]:
def fill_Date_birth_spark(x):
    name_2=x["NAME2"]
    name_1=x['NAME1']
    
    name_2=name_2.strip()
    name_1=name_1.strip()
    
    if not name_2.isnumeric():
        return name_1
    else:
        return name_1+"-"+name_2
   

In [37]:
main_spark_rdd=main_spark.rdd

In [38]:
main_spark_rdd_names_join=main_spark_rdd.map(fill_Date_birth_spark)

In [39]:
list_rdd_born=main_spark_rdd_names_join.collect()

In [40]:
main_spark = main_spark.repartition(1).withColumn(
    "Date_Birth", 
    udf(lambda id: list_rdd_born[id])(monotonically_increasing_id()))

In [41]:
main_spark=main_spark.drop("NAME1","NAME2")

In [42]:
main_spark.select("Date_Birth").show()

+----------------+
|      Date_Birth|
+----------------+
|    June 26-1898|
|   April 25-1734|
|   April 14-1882|
|  January 9-1942|
| October 28-1897|
|   March 24-1979|
|    July 17-1999|
| December 6-1993|
|February 26-1929|
| January 21-2003|
|    date unknown|
|    date unknown|
|   April 23-1986|
|  January 8-1999|
|    date unknown|
|    date unknown|
|   March 27-1991|
|  October 1-1987|
|            1947|
|            1975|
+----------------+
only showing top 20 rows



#### Pandas

In [4]:
date_splitted=main["Born"].str.split(",",expand=True)
date_splitted.iloc[:,1]=date_splitted.iloc[:,1].fillna('')

In [5]:
def fill_date_birth(x):
    x[1]=x[1].strip()
    if not x[1].isnumeric():
        return x[0]
    else:
        return x[0]+"-"+x[1]

main["Date_Birth"]=date_splitted.apply(fill_date_birth,axis=1)

### What to do with players whose date is not known

#### PySpark

In [43]:
main_spark = main_spark.withColumn('Date_Birth', trim(col('Date_Birth')))

In [44]:
mode_date_birth_spark=main_spark.filter(~main_spark.Date_Birth.contains('unknown')).groupby('Date_Birth').count().orderBy('count',ascending=False).first()[0]

In [45]:
main_spark=main_spark.withColumn('Date_Birth', when(main_spark.Date_Birth.rlike('known|Known|circa|Details|Hong|Cheema|circ|Pakistan|Goa|Kamrup|date unknown'),mode_date_birth_spark).otherwise(
    main_spark['Date_Birth']
))


In [46]:
main_spark=main_spark.withColumn('Date_Birth', regexp_replace('Date_Birth', ' ', '/')) 
main_spark=main_spark.withColumn('Date_Birth', regexp_replace('Date_Birth', '-', '/')) 
main_spark.select('Date_Birth').show()

+----------------+
|      Date_Birth|
+----------------+
|    June/26/1898|
|   April/25/1734|
|   April/14/1882|
|  January/9/1942|
| October/28/1897|
|   March/24/1979|
|    July/17/1999|
| December/6/1993|
|February/26/1929|
| January/21/2003|
|February/24/1975|
|February/24/1975|
|   April/23/1986|
|  January/8/1999|
|February/24/1975|
|February/24/1975|
|   March/27/1991|
|  October/1/1987|
|            1947|
|            1975|
+----------------+
only showing top 20 rows



In [47]:
def date_birth_func_spark(x):
    month_to_numeric={
        "January":"01",
        "February":"02",
        "March":"03",
        "April":"04",
        "May":"05",
        "June":"06",
        "July":"07",
        "August":"08",
        "September":"09",
        "October":"10",
        "November":"11",
        "December":"12",
        "Q1":"1" 
    }
    
    date_instance=x.split("/")
    if '' in date_instance:
        date_instance[date_instance.index('')]='1994'
    if '0000' in date_instance:
        date_instance[date_instance.index('0000')]='1994'
    if len(date_instance)==1:
        return datetime.strptime("01/01/"+date_instance[0],'%m/%d/%Y')
    else:
        if date_instance[0] in month_to_numeric.keys():
            date_instance[0]=month_to_numeric[date_instance[0]]
        return datetime.strptime("/".join(date_instance),'%m/%d/%Y')

In [49]:
udf_function = udf(lambda x: date_birth_func_spark(x),DateType())
main_spark=main_spark.withColumn('Date_Birth',udf_function(col('Date_Birth')))

#### Pandas

In [6]:
main["Date_Birth"]=main["Date_Birth"].str.strip()

In [7]:
mode_date_birth=main.loc[~(main["Date_Birth"].str.contains('unknown')),'Date_Birth'].mode().values.tolist()[0]

In [8]:
main.loc[(main["Date_Birth"].str.contains('known|Known|circa|Details|Hong|Cheema|circ|Pakistan|Goa|Kamrup|Mansa|Matale|Badulla|Balapitiya|Kiribathgoda|Colombo|Westminster|Polonnaruwa')),'Date_Birth']="January 1-"+mode_date_birth

In [9]:
main["Date_Birth"]=main["Date_Birth"].str.replace(" ","/")
main["Date_Birth"]=main["Date_Birth"].str.replace("-","/")

In [10]:
final_list=[]
def date_birth_func(x):
    month_to_numeric={
        "January":"01",
        "February":"02",
        "March":"03",
        "April":"04",
        "May":"05",
        "June":"06",
        "July":"07",
        "August":"08",
        "September":"09",
        "October":"10",
        "November":"11",
        "December":"12",
        "Q1":"1" 
    }
    if x=='':
        return datetime.strptime("01/01/"+mode_date_birth, '%m/%d/%Y')
    date_instance=x.split("/")
    try:
        if '' in date_instance:
            date_instance[date_instance.index('')]=mode_date_birth
            return datetime.strptime("/".join(date_instance), '%m/%d/%Y')
        if '0000' in date_instance:
            date_instance[date_instance.index('0000')]=mode_date_birth
        if len(date_instance)==1:
            return datetime.strptime("01/01/"+date_instance[0], '%m/%d/%Y')
        elif len(date_instance)==2:
            date_instance[0]=month_to_numeric[date_instance[0]]
            return datetime.strptime(date_instance[0]+"/01/"+date_instance[1], '%m/%d/%Y')
        else:
            if date_instance[0] in month_to_numeric.keys():
                date_instance[0]=month_to_numeric[date_instance[0]]
            return datetime.strptime("/".join(date_instance), '%m/%d/%Y')
    except:
        final_list.append(x)
   
        
    
    
   
    
    

In [11]:
main["Born"]=main["Date_Birth"].map(date_birth_func)

In [12]:
main.drop('Date_Birth',axis=1,inplace=True)

## Forming the Date Death Column which have Year+Date the player was born

#### Pyspark

In [51]:
split_col = split(main_spark['Died'], ',')

In [52]:
main_spark = main_spark.withColumn('NAME1', split_col.getItem(0))
main_spark = main_spark.withColumn('NAME2', split_col.getItem(1))

In [53]:
main_spark=main_spark.fillna('',subset=['NAME2'])

In [54]:
custom_func_name1_name2=udf(lambda x,y:x+"-"+y)

In [55]:
main_spark=main_spark.withColumn('Died',custom_func_name1_name2(col('NAME1'),col('NAME2')))

In [56]:
main_spark=main_spark.drop('NAME1','NAME2')

In [57]:
mode_date_died_spark=main_spark.filter(~main_spark.Died.rlike('nan|unknown|Presumed Dead|Known|Unknown')).groupby('Died').count().orderBy('count',ascending=False).first()[0]

In [58]:
mode_date_died_spark

'September 10- 1967'

In [59]:
global mode_year_spark
mode_year_spark=mode_date_died_spark.split('- ')[-1]

In [60]:
main_spark=main_spark.withColumn('Died', when(main_spark.Died.rlike('unknown|Presumed Dead|Known|Unknown|0000|aged|Duncton|Leicester|Tasmania'),mode_date_died_spark).otherwise(
    main_spark['Died']
))

In [61]:
main_spark.select('Died').show()

+------------------+
|              Died|
+------------------+
|    March 12- 1982|
|    April 25- 1797|
|September 10- 1967|
|              nan-|
|   August 28- 1947|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
|              nan-|
+------------------+
only showing top 20 rows



In [62]:
main_spark=main_spark.withColumn('Died', regexp_replace('Died', '- ', '/')) 
main_spark=main_spark.withColumn('Died', regexp_replace('Died', ' ', '/')) 
main_spark=main_spark.withColumn('Died', regexp_replace('Died', 'nan/', 'nan')) 


In [63]:
def date_death_func_spark(x):
    month_to_numeric={
        "January":"01",
        "February":"02",
        "March":"03",
        "April":"04",
        "May":"05",
        "June":"06",
        "July":"07",
        "August":"08",
        "September":"09",
        "October":"10",
        "November":"11",
        "December":"12",
        "Q1":"1" ,
        'Q2':'4'
    }  
   
    if x=='nan':
        return 'nan'
    date_list=x.split('/')[:3]
    all_numeric=True
    index_to_pop=[]
    for index,instance in enumerate(date_list):
        if not instance.isnumeric() and instance not in month_to_numeric.keys():
            all_numeric=False
        else:    
            index_to_pop.append(index)
    if all_numeric:
        date_list[0]=month_to_numeric[date_list[0]]
        return datetime.strptime('/'.join(date_list),'%m/%d/%Y')
        
    else:

        new_date_list=[]
        for index in index_to_pop:
            new_date_list.append(date_list[index])
        if len(new_date_list)==1:
            return datetime.strptime('01/'+'01/'+new_date_list[0],'%m/%d/%Y')
        elif len(new_date_list)==2 and new_date_list[0] in month_to_numeric.keys():
            new_date_list[0]=month_to_numeric[new_date_list[0]]
            return datetime.strptime(new_date_list[0]+'/01/'+new_date_list[1],'%m/%d/%Y')
        else:
            return datetime.strptime('01/'+'01/'+mode_year_spark,'%m/%d/%Y')
                
    
        
            
    
      

In [64]:
death_udf=udf(lambda x:date_death_func_spark(x),DateType())
main_spark=main_spark.withColumn('Died',death_udf(col('Died')))

#### Pandas

In [13]:
main['Died']=main['Died'].map(lambda x:"-".join(x.split(',')[:2]))

In [14]:
mode_died=main.loc[~(main['Died'].str.contains('nan|unknown|Presumed Dead|Known|Unknown')),'Died'].mode().values.tolist()[0]

In [15]:
mode_year=mode_died.split('- ')[-1]

In [16]:
main.loc[(main['Died'].str.contains('unknown|Presumed Dead|Known|Unknown|0000|aged|Duncton|Leicester|Tasmania')),'Died']=mode_died

In [17]:
main['Died']=main['Died'].str.replace('- ','/').str.replace(' ','/')

In [18]:
def date_death_func(x):
    month_to_numeric={
        "January":"01",
        "February":"02",
        "March":"03",
        "April":"04",
        "May":"05",
        "June":"06",
        "July":"07",
        "August":"08",
        "September":"09",
        "October":"10",
        "November":"11",
        "December":"12",
        "Q1":"1" ,
        'Q2':'4'
    }  
   
    if x=='nan':
        return 'nan'
    date_list=x.split('/')[:3]
    all_numeric=True
    index_to_pop=[]
    for index,instance in enumerate(date_list):
        if not instance.isnumeric() and instance not in month_to_numeric.keys():
            all_numeric=False
        else:    
            index_to_pop.append(index)
    if all_numeric:
        date_list[0]=month_to_numeric[date_list[0]]
        return datetime.strptime('/'.join(date_list), '%m/%d/%Y')
        
    else:

        new_date_list=[]
        for index in index_to_pop:
            new_date_list.append(date_list[index])
        if len(new_date_list)==1:
            return datetime.strptime('01/'+'01/'+new_date_list[0], '%m/%d/%Y')
        elif len(new_date_list)==2 and new_date_list[0] in month_to_numeric.keys():
            new_date_list[0]=month_to_numeric[new_date_list[0]]
            return datetime.strptime(new_date_list[0]+'/01/'+new_date_list[1], '%m/%d/%Y')
        else:
            return datetime.strptime('01/'+'01/'+mode_year, '%m/%d/%Y')
                
    
        
            
    
      

In [19]:
main['Died']=main['Died'].map(date_death_func)

## Forming the Age Column
    -  Age=Death-Birth (IF died)
    -  Age=Today()- Birth (IF Alive)

#### PySpark

In [66]:
main_spark=main_spark.withColumn('Age',col('Current age'))

In [67]:
main_spark=main_spark.withColumnRenamed("Current age","Current_age")

In [71]:
case_1_func=udf(lambda x:(datetime.now()-x).days/365)

In [73]:
main_spark=main_spark.withColumn("Age",when(~main_spark.Age.contains('nan'),case_1_func(col('Date_Birth'))).otherwise(main_spark['Age']))

In [74]:
Case_2_func=udf(lambda x,y :(x-y).days/365)

In [80]:
main_spark=main_spark.withColumn("Age",when(main_spark.Age.contains('nan') & ~main_spark.Died.isNull() ,Case_2_func(col('Died'),col('Date_Birth'))).otherwise(main_spark['Age']))

+------------------+
|               Age|
+------------------+
| 83.76438356164384|
| 63.04383561643836|
| 85.46301369863014|
| 80.22191780821917|
|49.863013698630134|
| 42.99452054794521|
|22.665753424657535|
|28.279452054794522|
|  93.0986301369863|
|19.147945205479452|
|-8.153424657534247|
|-8.153424657534247|
| 35.90684931506849|
|23.186301369863013|
|-8.153424657534247|
|-8.153424657534247|
| 30.97808219178082|
|34.465753424657535|
| 75.24109589041096|
|47.221917808219175|
+------------------+
only showing top 20 rows



In [None]:
### Implement Case 3,4,5,6, From Pandas

#### Pandas

In [37]:
main['Age']=main['Current age']

In [38]:
main.loc[(main['Current age']!='nan'),'Age']=main.loc[(main['Current age']!='nan'),:].apply(lambda x:(datetime.today()-x['Born']).days/365,axis=1)#### Case-1 Those who are still alive, what is their updated age?

In [40]:
main.loc[(main['Current age']=='nan') & ~(main['Died'].isnull()) ,'Age']=main.loc[(main['Current age']=='nan') & ~(main['Died'].isnull()) ,:].apply(lambda x:(x['Died']-x['Born']).days/365,axis=1)### Case-2 Those who have died and their dead date is known

In [41]:
main.loc[(main['Current age']=='nan') & (main['Died'].isnull()) ,'Age']=main.loc[(main['Current age']=='nan') & (main['Died'].isnull()) ,:].apply(lambda x:(datetime.today()-x['Born']).days/365,axis=1)## Case Those who are alive but their current age is nan

In [49]:
main.loc[(main['Age']<0),'Died']=None #### This is happening because some of died values were imputed from the mode which might have the cause the died column to be less than Birth therefore making them null makes sense

In [52]:
main.loc[(main['Age']<0)&(main['Died']).isnull(),'Age']=main.loc[(main['Age']<0)&(main['Died']).isnull(),:].apply(lambda x:(datetime.today()-x['Born']).days/365,axis=1)##### Those whose age is negative due to death mode imputation--assumption is that they are still alive.

In [57]:
main.loc[(main['Age']>150),'Age']=150 ### No one can live that long :D 

In [60]:
main.drop('Current age',axis=1,inplace=True)

In [61]:
main

Unnamed: 0,ID,NAME,COUNTRY,Full name,Born,Died,Major teams,Education,Height,Nickname,...,BOWLING_T20s_Wkts,BOWLING_T20s_BBI,BOWLING_T20s_BBM,BOWLING_T20s_Ave,BOWLING_T20s_Econ,BOWLING_T20s_SR,BOWLING_T20s_4w,BOWLING_T20s_5w,BOWLING_T20s_10,Age
0,8772,Henry Arkell,England,Henry John Denham Arkell,1898-06-26,1982-03-12,Northamptonshire,,,,...,,,,,,,,,,83.7644
1,532565,Richard Nyren,England,Richard Nyren,1734-04-25,1797-04-25,Hampshire XI,,,,...,,,,,,,,,,63.0438
2,16856,Sydney Maartensz,England,Sydney Gratien Adair Maartensz,1882-04-14,1967-09-10,Hampshire,,,,...,,,,,,,,,,85.463
3,16715,Brian Lander,England,Brian Richard Lander,1942-01-09,NaT,"['Durham,', 'Minor Counties']",,,,...,,,,,,,,,,80.2137
4,15989,Derek Kenderdine,England,Derek Charles Kenderdine,1897-10-28,1947-08-28,Royal Navy,,,,...,,,,,,,,,,49.863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90303,19777,Keith Robinson,England,Keith Robinson,1933-12-17,NaT,Combined Services,,,,...,,,,,,,,,,88.2822
90304,14843,Trevitt Hine-Haycock,England,Trevitt Reginald Hine-Haycock,1861-12-03,1953-11-02,"['Kent,', 'Oxford University']",,,,...,,,,,,,,,,91.9753
90305,15025,John Hughes,England,John Hughes,1825-07-02,1907-01-29,South of England,,,,...,,,,,,,,,,81.6301
90306,11167,John Clayton,England,John Morton Clayton,1857-11-17,1938-04-01,Derbyshire,,,,...,,,,,,,,,,80.4219
