# Japanese data cleaning

## SUUMO properties

In [None]:
import pandas as pd

# Load the SUUMO listings and preview the first rows
suumo_path = "suumo-properties.csv"
df = pd.read_csv(suumo_path)
df.head()

How much is the rent?

In [None]:
# Remove the "万円" text so the rent can be converted to a float
rent_without_unit = df['Rent'].str.replace("万円", "")
df['cleaned_rent'] = rent_without_unit.astype(float)
df['cleaned_rent'].describe()

How large are these properties?

In [None]:
# Remove the "m2" text so the area can be converted to a float
area_without_unit = df['Area Size'].str.replace("m2", "")
df['cleaned_area'] = area_without_unit.astype(float)
df.head()

What is the relationship between size and rent?

In [None]:
# Scatter plot of cleaned area (x) against cleaned rent (y)
df.plot.scatter(x='cleaned_area', y='cleaned_rent')

In [None]:
# Pairwise correlation between the cleaned area and cleaned rent columns
size_and_rent = df[['cleaned_area', 'cleaned_rent']]
size_and_rent.corr()

What direction do they face?

In [None]:
# Count how many listings have each Direction value
direction_counts = df['Direction'].value_counts()
direction_counts

What does the floor plan look like?

In [None]:
# Count how many listings have each Floor Plan value
floor_plan_counts = df['Floor Plan'].value_counts()
floor_plan_counts

Which ones have living rooms?

In [None]:
# Keep only the rows whose floor plan text contains an "L"
has_living_room = df['Floor Plan'].str.contains("L")
df[has_living_room]

Extract the number of bedrooms

In [None]:
# Swap the "ワンルーム" floor plan value for '1', then re-check the counts
df['Floor Plan'] = df['Floor Plan'].replace(to_replace="ワンルーム", value='1')
df['Floor Plan'].value_counts()

In [None]:
# The first character of the floor plan is converted to an integer bedroom count
leading_char = df['Floor Plan'].str[0]
df['bedrooms'] = leading_char.astype(int)
df.head()

How far are they from each station?

In [None]:
# Look at the raw station text of the first listing
station_info = df['Station Info']
station_info.iloc[0]

In [None]:
# Pull every (station, minutes) pair out of the station text.
# findall returns one list of tuples per listing.
df['station_distance'] = df['Station Info'].str.findall("(.*) 歩(.*)分")
# Show the preview inline instead of copying it to the clipboard:
# to_clipboard() displays nothing in the notebook and raises on machines
# that have no clipboard backend, which breaks "Run All".
df.head()

In [None]:
# Give each station its own column holding the extracted minutes value.
# Iterating the single column avoids building a full row Series per
# listing, and the isinstance guard skips listings whose Station Info was
# missing (findall leaves NaN there, which cannot be iterated).
for i, pairs in df['station_distance'].items():
    if not isinstance(pairs, list):
        continue
    for station, minutes in pairs:
        df.at[i, station] = int(minutes)

In [None]:
# Lift the column display limit so every column is shown
pd.set_option("display.max_columns", None)

df

# Mercari products

In [None]:
import pandas as pd

# Load the Mercari listings and preview the first rows
mercari_path = "mercari-subset.csv"
df = pd.read_csv(mercari_path)
df.head()

## Install everything to use Instructor

In [None]:
# Install or upgrade (quietly) the libraries this section imports
%pip install --quiet --upgrade instructor openai tqdm pydantic

## Set things up

In [None]:
import instructor
from pydantic import BaseModel, Field, field_validator
from openai import OpenAI
from typing import Optional, List
from typing_extensions import Literal
import re
from tqdm.auto import tqdm

# Allow a progress bar
tqdm.pandas()
# Connect to the OpenAI API. The key is read from the OPENAI_API_KEY
# environment variable instead of being hardcoded here, so it cannot leak
# when the notebook is shared or committed.
client = instructor.from_openai(OpenAI())

### Extracting data from a user comment

In [None]:
# Sample customer comment used as the input text for the extraction example
comment = """
I am very angry about the broccoli incident, I am never shopping at your 
store again. Take me off of your broccoli mailing list immediately.

Jackary Baloneynose
jackary.baloney@example.com
"""

In [None]:
# Fields to extract from a customer comment (used as the response_model).
# Kept as a `#` comment rather than a class docstring so the generated
# schema description stays unchanged.
class Comment(BaseModel):
    name: str = Field(description="Customer name")
    email: str = Field(description="Customer email")
    food: str = Field(description="Food item")
    # default=None makes the date genuinely optional: in Pydantic v2 an
    # Optional[...] field without a default is still a required field.
    date: Optional[str] = Field(default=None, description="Comment date, if known, in format YYYY-MM-DD")
    sentiment: Literal["positive", "negative", "unknown"] = Field(description="Comment sentiment")

In [None]:
# Send the comment as a single user message and have the reply parsed into a Comment
comment_messages = [{"role": "user", "content": comment}]
result = client.chat.completions.create(
    messages=comment_messages,
    response_model=Comment,
    model="gpt-4o-mini",
    max_retries=3,
    temperature=0,
)

result.model_dump()

### Extracting data similar to our dataset 

In [None]:
# Sample product listing text used as the input for the extraction example
product = """
YAMAHA DTXTREME モジュール　電子ドラム (2)の画像 21,500円
"""

In [None]:
# Allowed values for Product.instrument_category
InstrumentCategory = Literal[
    "brass",
    "string",
    "electronic",
    "percussion/drum",
    "piano/keyboard",
    "other",
]


# Fields to extract from a product listing (used as the response_model)
class Product(BaseModel):
    name: str = Field(description="Product name")
    brand: str = Field(description="Product brand")
    instrument_type: str = Field(description="Instrument type")
    product_code: str = Field(description="Model number")
    instrument_category: InstrumentCategory = Field(description="Type of instrument")

In [None]:
# Send the listing text as a single user message and have the reply parsed into a Product
product_messages = [{"role": "user", "content": product}]
result = client.chat.completions.create(
    messages=product_messages,
    response_model=Product,
    model="gpt-4o-mini",
    max_retries=3,
    temperature=0,
)

result.model_dump()

## Extracting from our dataframe

In [None]:
import pandas as pd

# Reload the Mercari listings, treating "," as the thousands separator
mercari_path = "mercari-subset.csv"
df = pd.read_csv(mercari_path, thousands=',')
df.head()

In [None]:
import time

def get_details(row):
    """Extract structured product details from one dataframe row.

    Sends the row's 'Product Name' to the model and returns the parsed
    Product as a plain dict. Rows whose product name is missing or blank
    return an empty dict without waiting or calling the API.
    """
    product = row['Product Name']

    # A missing (NaN/None) or blank name has nothing to extract from and
    # would otherwise be sent to the API as-is; a failure there would abort
    # the whole apply, so skip such rows up front.
    if pd.isna(product) or not str(product).strip():
        return {}

    time.sleep(1) # Wait one second

    result = client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=Product,
        messages=[
            {"role": "user", "content": str(product)}
        ],
        temperature=0,
        max_retries=3
    )

    return result.model_dump()

# Run get_details on every row, with a progress bar
row_axis = 1
details = df.progress_apply(get_details, axis=row_axis)
details

In [None]:
# Expand the per-row dicts into a DataFrame with one column per key
details_df = pd.json_normalize(data=details)
details_df

In [None]:
# Attach the extracted columns to the original rows, matched on the index
merged = df.join(details_df, how='left')
merged.head()

In [None]:
# Count how many listings were assigned each brand
brand_counts = merged['brand'].value_counts()
brand_counts