# Cleaning String Data in DataFrames
## Example 1


In [None]:
import pandas as pd

# Build a small, deliberately messy sample table: the values carry stray
# whitespace, mixed capitalization, inconsistent state spellings, and
# noisy price strings that the following steps clean up.
df = pd.DataFrame(
    data=dict(
        name=[" Alice ", "BOB", "charlie  ", "  dAvId", "EVa"],
        state=["Virginia  ", "New  york", "  new york ", " New York  ", "ny "],
        ticket_price=[" $ 100 %", "$200", " __$150", "$ 300  ", "  $,300"],
    )
)

# Show the raw rows first, then the dtype pandas assigned to each column.
print("Original Data:", df, sep="\n")
print(df.dtypes)

### Step 1: Inspect Unique Values

In [None]:
# List the distinct raw values of the state column before any cleaning.
print("Unique states BEFORE cleaning:", df["state"].unique(), sep="\n")


### Step 2: Remove Leading/Trailing Spaces

In [None]:

# Trim leading and trailing whitespace from every text column.
for text_column in ("name", "state", "ticket_price"):
    df[text_column] = df[text_column].str.strip()

print("After .str.strip():", df, sep="\n")


### Step 3: Remove Extra Spaces Inside Text

In [None]:

# Collapse every run of whitespace inside a value into a single space
# (e.g. a doubled space between two words becomes one).
for text_column in ("name", "state"):
    df[text_column] = df[text_column].str.replace(r"\s+", " ", regex=True)

print("After fixing internal spaces:", df, sep="\n")


### Step 4: Standardize Capitalization

In [None]:

# States go to ALL CAPS so differently-cased spellings collapse to one form.
df["state"] = df["state"].str.upper()
# Names go to Title Case (first letter of each word capitalized).
df["name"] = df["name"].str.title()

print("After standardizing capitalization:", df, sep="\n")


In [None]:
# Re-check the distinct state values now that whitespace and case are fixed.
print("Unique states AFTER cleaning:", df["state"].unique(), sep="\n")

In [None]:
# "NY" and "NEW YORK" refer to the same state but are still two separate
# categories. Map the abbreviation onto the full name so they are unified.
# Only values that are exactly "NY" are replaced.
df["state"] = df["state"].replace("NY", "NEW YORK")

print("After fixing inconsistent categories:", df, sep="\n")


### Step 5: Remove Unwanted Symbols (e.g., $) and Convert the Price to Numeric

In [None]:

# Drop the literal "$" first (regex=False, so "$" is not treated as the
# end-of-string anchor), then delete all remaining whitespace.
df["ticket_price"] = (
    df["ticket_price"]
    .str.replace("$", "", regex=False)
    .str.replace(r"\s+", "", regex=True)
)

print("After removing $ and extra spaces:", df, sep="\n")

In [None]:
# Some prices still carry stray characters (e.g. ",300" and "__150").
# A regular expression can capture just the numeric part of each value.
import re

# A single capture group matching the first run of digits in a string.
price_pattern = re.compile(r"(\d+)")

# expand=False makes str.extract return a Series for the single capture
# group, so a Series (rather than a one-column DataFrame) is assigned back
# to the column. Values containing no digits become missing (NaN).
# NOTE: only the FIRST digit run is kept, so a value such as "1,300" would
# yield "1"; strip thousands separators beforehand if the data can contain
# them.
df["ticket_price"] = df["ticket_price"].str.extract(price_pattern, expand=False)
print(df)

In [None]:

# Parse the price strings into numbers. errors="coerce" turns any value
# that cannot be parsed into NaN instead of raising an exception.
df["ticket_price"] = df["ticket_price"].pipe(pd.to_numeric, errors="coerce")

print("Final cleaned data:", df, sep="\n")



In [None]:
# Show the dtype of each column after cleaning.
final_dtypes = df.dtypes
print(final_dtypes)