In [1]:
# Dependencies
import pandas as pd
import re

# Read the semicolon-delimited character sheet into a DataFrame
df = pd.read_csv(
    'Data/Characters.csv',
    sep=';',
)

# Preview the top rows as a sanity check on the parse
df.head()

Unnamed: 0,Id,Name,Gender,Job,House,Wand,Patronus,Species,Blood status,Hair colour,Eye colour,Loyalty,Skills,Birth,Death
0,1,Harry James Potter,Male,Student,Gryffindor,"11"" Holly phoenix feather",Stag,Human,Half-blood,Black,Bright green,Albus Dumbledore | Dumbledore's Army | Order o...,Parseltongue| Defence Against the Dark Arts | ...,31 July 1980,
1,2,Ronald Bilius Weasley,Male,Student,Gryffindor,"12"" Ash unicorn tail hair",Jack Russell terrier,Human,Pure-blood,Red,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Wizard chess | Quidditch goalkeeping,1 March 1980,
2,3,Hermione Jean Granger,Female,Student,Gryffindor,"10¾"" vine wood dragon heartstring",Otter,Human,Muggle-born,Brown,Brown,Dumbledore's Army | Order of the Phoenix | Hog...,Almost everything,"19 September, 1979",
3,4,Albus Percival Wulfric Brian Dumbledore,Male,Headmaster,Gryffindor,"15"" Elder Thestral tail hair core",Phoenix,Human,Half-blood,Silver| formerly auburn,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Considered by many to be one of the most power...,Late August 1881,"30 June, 1997"
4,5,Rubeus Hagrid,Male,Keeper of Keys and Grounds | Professor of Care...,Gryffindor,"16"" Oak unknown core",,Half-Human/Half-Giant,Part-Human (Half-giant),Black,Black,Albus Dumbledore | Order of the Phoenix | Hogw...,Resistant to stunning spells| above average st...,6 December 1928,


In [2]:
# Pull the raw 'Birth' column out of the frame as a Series. The values are
# free-form text: full dates ("31 July 1980"), partial ones ("28 June",
# "1883", "Pre 1945") and missing entries (NaN) all occur.
birth_dates = df['Birth']

# Print the Series (pandas truncates the middle rows) to eyeball the formats
print(birth_dates)

0            31 July 1980
1            1 March 1980
2      19 September, 1979
3        Late August 1881
4         6 December 1928
              ...        
135                   NaN
136              Pre 1945
137                  1883
138               28 June
139                   NaN
Name: Birth, Length: 140, dtype: object


In [3]:
def extract_month_year(date_str):
    """Pull a month name and a four-digit year out of a free-form date string.

    Parameters
    ----------
    date_str : object
        Raw cell value such as ``'31 July 1980'``, ``'Late August 1881'``,
        ``'28 June'`` or ``'1883'``. Anything that is not a ``str`` (for
        example a NaN from a missing cell) is treated as "no information".

    Returns
    -------
    pandas.Series
        Two-element Series ``[month, year]``:

        * ``month`` is the first month token found, exactly as written in the
          input (full English name or a common abbreviation), or missing if
          none is present.
        * ``year`` is the last standalone four-digit number, as a string, or
          missing if none is present.
    """
    # Non-string input (NaN, None, numbers) carries no parsable date text
    if not isinstance(date_str, str):
        return pd.Series([None, None])

    # Word boundaries keep abbreviations from matching the start of unrelated
    # capitalised words ("Marked" -> "Mar", "Mayor" -> "May"). Full names are
    # listed before abbreviations; "Sept" is listed explicitly because the
    # trailing boundary would otherwise reject it.
    month_pattern = (
        r'\b(January|February|March|April|May|June|July|August|September|'
        r'October|November|December|'
        r'Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sept|Sep|Oct|Nov|Dec)\b'
    )
    # Digit lookarounds require exactly four digits, so a longer number such
    # as "123456" is not mistaken for the year "1234". Suffixes like "1980s"
    # still match because only neighbouring digits are excluded.
    year_pattern = r'(?<!\d)(\d{4})(?!\d)'

    # Only the first month mention is needed
    month_match = re.search(month_pattern, date_str)
    month = month_match.group(1) if month_match else None

    # When several years appear, keep the last one
    year_matches = re.findall(year_pattern, date_str)
    year = year_matches[-1] if year_matches else None

    return pd.Series([month, year])

# Apply the function to the Birth column; each returned two-element Series
# becomes one row of the new 'Month' / 'Year' columns
df[['Month', 'Year']] = df['Birth'].apply(extract_month_year)

# Join month and year with a single space. str.cat leaves the result missing
# whenever either part is missing, so a birth date such as "28 June" (no year)
# gives a missing Month_Year instead of a bogus label like "June None", which
# is what stringifying the Year column with astype(str) produced.
df['Month_Year'] = df['Month'].str.cat(df['Year'], sep=' ')

# Display the cleaned DataFrame
df.head()

Unnamed: 0,Id,Name,Gender,Job,House,Wand,Patronus,Species,Blood status,Hair colour,Eye colour,Loyalty,Skills,Birth,Death,Month,Year,Month_Year
0,1,Harry James Potter,Male,Student,Gryffindor,"11"" Holly phoenix feather",Stag,Human,Half-blood,Black,Bright green,Albus Dumbledore | Dumbledore's Army | Order o...,Parseltongue| Defence Against the Dark Arts | ...,31 July 1980,,July,1980,July 1980
1,2,Ronald Bilius Weasley,Male,Student,Gryffindor,"12"" Ash unicorn tail hair",Jack Russell terrier,Human,Pure-blood,Red,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Wizard chess | Quidditch goalkeeping,1 March 1980,,March,1980,March 1980
2,3,Hermione Jean Granger,Female,Student,Gryffindor,"10¾"" vine wood dragon heartstring",Otter,Human,Muggle-born,Brown,Brown,Dumbledore's Army | Order of the Phoenix | Hog...,Almost everything,"19 September, 1979",,September,1979,September 1979
3,4,Albus Percival Wulfric Brian Dumbledore,Male,Headmaster,Gryffindor,"15"" Elder Thestral tail hair core",Phoenix,Human,Half-blood,Silver| formerly auburn,Blue,Dumbledore's Army | Order of the Phoenix | Hog...,Considered by many to be one of the most power...,Late August 1881,"30 June, 1997",August,1881,August 1881
4,5,Rubeus Hagrid,Male,Keeper of Keys and Grounds | Professor of Care...,Gryffindor,"16"" Oak unknown core",,Half-Human/Half-Giant,Part-Human (Half-giant),Black,Black,Albus Dumbledore | Order of the Phoenix | Hogw...,Resistant to stunning spells| above average st...,6 December 1928,,December,1928,December 1928


In [8]:
# School names that should not appear as a value in the cleaned data;
# any cell exactly equal to one of them is blanked out below
invalid_text1 = "Beauxbatons Academy of Magic"
invalid_text2 = "Durmstrang Institute"

# Drop the descriptive columns that are not kept ('Job', 'Species',
# 'Hair colour', 'Eye colour', 'Loyalty', 'Skills'), the raw 'Birth'/'Death'
# text and the intermediate 'Month'/'Year' helpers, then replace the two
# school names with an empty string (literal match, not a regex)
df_cleaned = (
    df
    .drop(columns=['Job','Species','Hair colour','Eye colour', 'Loyalty', 'Skills', 'Birth', 'Death', 'Month', 'Year'])
    .replace([invalid_text1, invalid_text2], "", regex=False)
)
df_cleaned.head()

Unnamed: 0,Id,Name,Gender,House,Wand,Patronus,Blood status,Month_Year
0,1,Harry James Potter,Male,Gryffindor,"11"" Holly phoenix feather",Stag,Half-blood,July 1980
1,2,Ronald Bilius Weasley,Male,Gryffindor,"12"" Ash unicorn tail hair",Jack Russell terrier,Pure-blood,March 1980
2,3,Hermione Jean Granger,Female,Gryffindor,"10¾"" vine wood dragon heartstring",Otter,Muggle-born,September 1979
3,4,Albus Percival Wulfric Brian Dumbledore,Male,Gryffindor,"15"" Elder Thestral tail hair core",Phoenix,Half-blood,August 1881
4,5,Rubeus Hagrid,Male,Gryffindor,"16"" Oak unknown core",,Part-Human (Half-giant),December 1928


In [9]:
# Give the combined month/year column its final name, 'Birthdate'
df_cleaned = df_cleaned.rename({'Month_Year': 'Birthdate'}, axis='columns')

# Preview the frame to confirm the header changed
df_cleaned.head()

Unnamed: 0,Id,Name,Gender,House,Wand,Patronus,Blood status,Birthdate
0,1,Harry James Potter,Male,Gryffindor,"11"" Holly phoenix feather",Stag,Half-blood,July 1980
1,2,Ronald Bilius Weasley,Male,Gryffindor,"12"" Ash unicorn tail hair",Jack Russell terrier,Pure-blood,March 1980
2,3,Hermione Jean Granger,Female,Gryffindor,"10¾"" vine wood dragon heartstring",Otter,Muggle-born,September 1979
3,4,Albus Percival Wulfric Brian Dumbledore,Male,Gryffindor,"15"" Elder Thestral tail hair core",Phoenix,Half-blood,August 1881
4,5,Rubeus Hagrid,Male,Gryffindor,"16"" Oak unknown core",,Part-Human (Half-giant),December 1928


In [10]:
# Write the cleaned frame next to the source file, using the same ';'
# delimiter as the input and leaving out the positional index
df_cleaned.to_csv(
    'Data/Characters_cleaned.csv',
    sep=';',
    index=False,
)
