In [19]:
import pandas as pd
import re

# Load the raw character dataset from its local CSV export.
df = pd.read_csv(r"C:\Users\user\Desktop\TH KÖLN\Semester 5\Daten Modellierung\lotr_characters.csv")

# Step 1: Initial Inspection
print("Initial Dataset Info:\n")
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()



Initial Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   birth   704 non-null    object
 1   death   595 non-null    object
 2   gender  768 non-null    object
 3   hair    172 non-null    object
 4   height  98 non-null     object
 5   name    911 non-null    object
 6   race    771 non-null    object
 7   realm   197 non-null    object
 8   spouse  465 non-null    object
dtypes: object(9)
memory usage: 64.2+ KB
None


In [23]:
# Cleaning the data
# Keep only the rows in which every critical column is populated, then
# normalise the character names (trim surrounding whitespace, title-case).
# Both steps return new frames, so the raw `df` is left unmodified.
df_cleaned = (
    df
    .dropna(subset=['birth', 'death', 'gender', 'name', 'race'])
    .assign(name=lambda frame: frame['name'].str.strip().str.title())
)

# Text columns that are passed through the punctuation clean-up.
columns_to_clean = ['birth', 'death', 'gender', 'race', 'realm', 'spouse']
def clean_text(value):
    """Remove punctuation and surrounding whitespace from a text cell.

    Parameters
    ----------
    value : str or scalar
        The cell content to clean.

    Returns
    -------
    str, None, or the input unchanged
        * For a string: the text with every character that is neither a
          word character (letter, digit, underscore) nor whitespace removed,
          and with leading/trailing whitespace stripped.
        * For a missing value (as detected by ``pd.isna``): ``None``.
        * For any other non-string scalar: the value itself, untouched.
    """
    if not isinstance(value, str):
        return None if pd.isna(value) else value
    # Strip AFTER removing punctuation: deleting a leading/trailing symbol can
    # expose whitespace (e.g. "Rohan ," -> "Rohan "), which stripping first
    # would leave behind.
    return re.sub(r'[^\w\s]', '', value).strip()

# Run clean_text over every string cell of each selected column; non-string
# cells (such as missing values) are passed through unchanged.
# NOTE(review): spelling variants in `gender` (e.g. "Male" / "male" / "Males")
# are not merged by this step -- consider consolidating them separately.
for column_name in columns_to_clean:
    df_cleaned[column_name] = df_cleaned[column_name].map(
        lambda cell: clean_text(cell) if isinstance(cell, str) else cell
    )

# Persist the cleaned dataset next to the raw file (row index omitted).
cleaned_csv_path = r"C:\Users\user\Desktop\TH KÖLN\Semester 5\Daten Modellierung\lotr_characters_cleaned.csv"
df_cleaned.to_csv(cleaned_csv_path, index=False)

print("Cleaned dataset saved as 'lotr_characters_cleaned.csv'.")



Cleaned dataset saved as 'lotr_characters_cleaned.csv'.


In [29]:
# Analysis: number of rows remaining in the cleaned dataset
total_rows = df_cleaned.shape[0]
print("\nAnalysis Results:", f"Total Rows in Dataset: {total_rows}", sep="\n")



Analysis Results:
Total Rows in Dataset: 548


In [39]:
# Analysis: row count, unique names, gender distribution and most common races
total_rows = len(df_cleaned.index)                         # rows in the cleaned frame
unique_names = df_cleaned['name'].nunique()                # distinct character names
gender_distribution = df_cleaned['gender'].value_counts()  # frequency of each gender label
race_counts = df_cleaned['race'].value_counts().iloc[:5]   # five most frequent races

# Emit the report one section per print() call, in reading order.
report_sections = [
    "\nAnalysis Results:",
    f"Total Rows in Dataset: {total_rows}",
    f"Total Unique Names: {unique_names}",
    "\nGender Distribution:",
    gender_distribution,
    "\nTop 5 Most Common Races:",
    race_counts,
]
for section in report_sections:
    print(section)



Analysis Results:
Total Rows in Dataset: 548
Total Unique Names: 548

Gender Distribution:
gender
Male                469
Female               76
Males                 1
Most likely male      1
male                  1
Name: count, dtype: int64

Top 5 Most Common Races:
race
Men        316
Hobbits     83
Elves       49
Dwarves     34
Ainur       21
Name: count, dtype: int64


In [41]:
# Documentation: Markdown File
# Build the Markdown report from the analysis variables
# (total_rows, unique_names, gender_distribution, race_counts).
# The two value-count tables are multi-line plain text, so each is wrapped in a
# fenced code block: without the fence a Markdown renderer joins the lines into
# a single paragraph and the column alignment is lost.
documentation = f"""# Data Cleaning and Analysis Documentation

## Cleaning Steps
1. Loaded the dataset `lotr_characters.csv`.
2. Inspected the dataset for missing or inconsistent data.
3. Removed rows with missing values in critical columns: `birth`, `death`, `gender`, `name`, `race`.
4. Standardized the `name` column by stripping whitespace and converting to title case.
5. Cleaned other columns (`birth`, `death`, `gender`, `race`, `realm`, `spouse`) by removing non-alphanumeric characters and leading/trailing whitespace.
6. Saved the cleaned dataset as `lotr_characters_cleaned.csv`.

## Analysis Results
### Total Rows in Dataset
- {total_rows}

### Total Unique Names
- {unique_names}

### Gender Distribution
```text
{gender_distribution.to_string()}
```

### Top 5 Most Common Races
```text
{race_counts.to_string()}
```

"""

In [43]:
# Save the Markdown file
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding (a legacy code page on Windows), which makes the
# file's bytes machine-dependent and can raise UnicodeEncodeError for
# characters outside that code page.
with open(r"C:\Users\user\Desktop\TH KÖLN\Semester 5\Daten Modellierung\data_cleaning.md", 'w', encoding='utf-8') as f:
    f.write(documentation)

print("Markdown documentation saved as 'data_cleaning.md'.")


Markdown documentation saved as 'data_cleaning.md'.
