# Data extraction and cleaning

In [20]:
import os
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm

In [2]:
load_dotenv()

True

## Used car and related data

Data sources:
- Used car data: [Kaggle](https://www.kaggle.com/datasets/shubham1kumar/usedcar-data?select=UserCarData.csv)
- GDP data: [Federal Reserve Bank of Philadelphia](https://www.philadelphiafed.org/surveys-and-data/real-time-data-research/gdpplus)
- Global Supply Chain Pressure Index: [Federal Reserve Bank of New York](https://www.newyorkfed.org/research/policy/gscpi#/overview)
- VIN information: [Wikibooks](https://en.wikibooks.org/wiki/Vehicle_Identification_Numbers_(VIN_codes)/World_Manufacturer_Identifier_(WMI)) & [Wikibooks](https://en.wikibooks.org/wiki/Vehicle_Identification_Numbers_(VIN_codes)/Model_year)

In [4]:
df_uc = pd.read_csv('raw/vehicles.csv', parse_dates=['posting_date'])
df_gdp = pd.read_excel('raw/gdpplus.xlsx')
df_gscpi = pd.read_excel('raw/gscpi_data.xlsx', sheet_name='GSCPI Monthly Data', names=['date', 'gscpi'])
df_manufacturers = pd.read_excel('raw/manufacturers.xlsx')

In [83]:
df_uc.sample(3)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
355875,7307797320,https://knoxville.craigslist.org/cto/d/rockfor...,knoxville,https://knoxville.craigslist.org,5900,2021.0,nissan,rogue,good,,...,,SUV,white,https://images.craigslist.org/00e0e_2IDRXJeZle...,Car information: 2011 Nissan Rogue Mileage: 14...,,tn,35.858942,-83.953357,2021-04-16 13:16:05-04:00
102559,7313632270,https://ocala.craigslist.org/ctd/d/alachua-201...,ocala,https://ocala.craigslist.org,34978,2018.0,ford,f150 4x4 xlt 4wd f-150,,,...,,,,https://images.craigslist.org/00u0u_14pPcqyUAM...,2018 FORD F150 XLT 4WD / F-150 4X4 TRUCK ~ Hun...,,fl,29.803281,-82.52738,2021-04-27 20:27:55-04:00
329917,7302796563,https://harrisburg.craigslist.org/ctd/d/hummel...,harrisburg,https://harrisburg.craigslist.org,16500,2011.0,ford,f150 ext cab 5.0 4x4,good,8 cylinders,...,,,black,https://images.craigslist.org/00c0c_jVmoOHjQtw...,"This truck has the 5.0 V8, xlt, 4X4. Has power...",,pa,40.8224,-76.8683,2021-04-06 16:23:15-04:00


In [84]:
df_uc.drop(columns=['url', 'region_url', 'image_url', 'description', 'lat', 'long', 'title_status', 'size', 'county'], inplace=True)

In [85]:
df_uc = df_uc.loc[df_uc['posting_date'].notnull()]

In [86]:
df_uc['posting_date'] = pd.to_datetime(df_uc['posting_date'], utc=True).dt.date

In [87]:
df_uc.sample(3)

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,VIN,drive,type,paint_color,state,posting_date
423851,7311555799,milwaukee,4900,2007.0,chevrolet,tahoe,,,gas,1.0,automatic,1GNFC13J77R114294,fwd,other,,wi,2021-04-23
307426,7308703228,stillwater,29590,2013.0,gmc,sierra 1500 extended cab slt,good,8 cylinders,gas,37888.0,other,1GTR1WE01DZ407455,,pickup,silver,ok,2021-04-18
333571,7313665645,philadelphia,5195,2008.0,nissan,altima,good,4 cylinders,gas,134923.0,automatic,1N4AL21E38N441627,fwd,sedan,grey,pa,2021-04-28


In [88]:
df_uc.to_csv('data/used_car.csv', index=False)

In [6]:
df_gdp.head(3)

Unnamed: 0,OBS_YEAR,OBS_QUARTER,OBS_QUARTER2,RECBARS,GRGDP_DATA,GRGDI_DATA,GDPPLUS_DATA
0,1960,1,0.0,0,8.90076,8.80472,5.21626
1,1960,2,0.25,1,-2.16667,-0.068308,1.26533
2,1960,3,0.5,1,1.96191,0.022771,-0.318666


In [7]:
df_gdp = df_gdp.groupby('OBS_YEAR').agg({'GRGDP_DATA': 'mean'}).reset_index().rename(columns={'OBS_YEAR': 'year', 'GRGDP_DATA': 'gdp_growth'})

In [8]:
df_gdp['gdp_growth'] = df_gdp.gdp_growth/100

In [9]:
df_gdp.to_csv('gdp.csv', index=False)

In [10]:
df_gscpi.head(3)

Unnamed: 0,date,gscpi
0,28-Feb-1998,-0.43558
1,31-Mar-1998,-0.060024
2,30-Apr-1998,-0.118535


In [11]:
df_gscpi['date'] = pd.to_datetime(df_gscpi.date)

In [12]:
df_gscpi['year'] = df_gscpi.date.apply(lambda x: x.year)

In [13]:
df_gscpi = df_gscpi.groupby('year').agg({'gscpi': 'mean'}).reset_index()

In [14]:
df_uc = df_uc.merge(df_gdp, left_on='year', right_on='year', how='left').merge(df_gscpi, left_on='year', right_on='year', how='left')

In [15]:
df_uc.to_csv('used_car.csv', index=False)

In [7]:
cohere_api_key = os.getenv('API_KEY_COHERE')

In [25]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_cohere import ChatCohere
from pydantic import BaseModel, Field,  ValidationError, field_validator
from typing import Optional


llm = ChatCohere(temperature=0.0, cohere_api_key=cohere_api_key)

# Define your desired data structure.
class Manufacturer(BaseModel):
    user_input: str = Field(description="The user input that contains the car manufacturer name.")
    manufacturer: Optional[str] = Field(
        description="This is the short lowercased name of the manufacturer retrieved from the user input.")
    
    @field_validator('manufacturer')
    def validate_manufacturer(cls, v):
        if v and len(v) < 3:
            raise ValueError('Manufacturer name must be at least 3 characters long.')
        return v

# Set up a parser + inject instructions into the prompt template.
structured_llm = llm.with_structured_output(Manufacturer)


system = """You will be given a series of car manufacturer names from a user.
Often the input includes additional 
information such as the location where the car was manufactured,
the body style, the model or also in which country the manufacturer is located.
Use your world knowledge to only retrieve the short 
and crisp name of the car manufacturer in lowercase letters 
and dash seperated if the name consists of more than one word.

Here are some examples of user inputs and the expected manufacturer names:

example_input: "Mercedes Benz truck & bus (Argentina)"
example_output: {{"user_input": "Mercedes Benz truck & bus (Argentina)", "manufacturer": "mercedes-benz"}}

example_input: "'Toyota Motor Europe (based in Belgium) used for Toyota ProAce, Toyota ProAce City and Toyota ProAce Max made by PSA/Stellantis'"
example_output: {{"user_input": "'Toyota Motor Europe (based in Belgium) used for Toyota ProAce, Toyota ProAce City and Toyota ProAce Max made by PSA/Stellantis'", "manufacturer": "toyota"}}

example_input: "Tesla, Inc. (US-built MPVs (e.g. Model X, Model Y))"
example_output: {{"user_input": "Tesla, Inc. (US-built MPVs (e.g. Model X, Model Y))", "manufacturer": "tesla"}}
"""

prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{input}")])

few_shot_structured_llm = prompt | structured_llm

In [26]:
results = []

In [34]:
wmis = df_manufacturers.wmi.values.tolist()
for wmi in tqdm(wmis[377+190:]):
    user_input = df_manufacturers.loc[df_manufacturers.wmi == wmi, 'manufacturer_long'].values[0]
    try:
        output = few_shot_structured_llm.invoke(user_input).model_dump()
        output['wmi'] = wmi
        results.append(output)
    except ValidationError as e:
        print(f"{wmi}: {e}") 

  0%|          | 0/1746 [00:00<?, ?it/s]


TooManyRequestsError: status_code: 429, body: data=None message="You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"

In [37]:
df_res = pd.DataFrame(results)

In [44]:
df_result = df_res[['wmi', 'manufacturer']]

In [53]:
df_result['manufacturer'] = df_result['manufacturer'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['manufacturer'] = df_result['manufacturer'].apply(lambda x: x.lower())


In [56]:
df_result.set_index('wmi')['manufacturer'].to_dict()

{'AAA': 'audi',
 'AAK': 'faw',
 'AAM': 'man',
 'AAP': '',
 'AAV': 'volkswagen',
 'AAW': 'challenger-trailer',
 'AA9': 'tr-tec',
 'CN1': 'tr-tec',
 'ABJ': 'mitsubishi',
 'ABM': 'bmw',
 'ACV': 'isuzu',
 'AC5': 'hyundai',
 'ADB': 'mercedes-benz',
 'ADD': '',
 'ADM': 'general-motors',
 'ADN': 'nissan',
 'ADR': 'renault',
 'ADX': 'tata',
 'AFA': '',
 'AFB': 'mazda',
 'AFD': 'baic',
 'AHH': 'hino',
 'AHM': 'mercedes-benz',
 'AHT': 'toyota',
 'BF9/': 'kibo',
 'BUK': 'kiira-motors-corporation',
 'BR1': 'mercedes-benz',
 'EBZ': 'nizhekotrans',
 'DF9/': 'laraki',
 'HA0': 'wuxi-sundiro-electric-vehicle-co-ltd',
 'HA6': 'niu technologies',
 'HA7': 'jinan-qingqi-kr-motors-co-ltd',
 'HES': 'smart',
 'HGL': 'farizon-auto',
 'HGX': 'wuling',
 'HJR': 'jetour',
 'HL4': 'morini',
 'HRV': 'beijing-henrey',
 'HZ2': 'taizhou-zhilong-technology-co-ltd',
 'H0D': 'taizhou-qianxin-vehicle-co-ltd',
 'JAA': 'isuzu',
 'JAB': 'isuzu',
 'JAC': 'isuzu',
 'JAE': 'acura',
 'JAL': 'isuzu',
 'JAM': 'isuzu',
 'JA3': 'mits

In [55]:
df_result

Unnamed: 0,wmi,manufacturer
0,AAA,audi
1,AAK,faw
2,AAM,man
3,AAP,
4,AAV,volkswagen
...,...,...
697,MM8,mazda
698,MNA,ford
699,MNB,ford
700,MNC,ford


## Insurance Claims data

Data source:

- Claims descriptions: [GitHub](https://github.com/Mahesh3394/Claim-Description-Classification)

In [11]:
df_claims = pd.read_excel('raw/Dataset_Public.xlsx')

In [12]:
df_claims.head(3)

Unnamed: 0,Claim Description,Coverage Code,Accident Source
0,THE IV WAS MAKING A LEFT TURN ON A GREEN ARROW...,AN,"Struck pedestrian, bicycle"
1,CLAIMANT ALLEGES SHE SUFFERED INJURIES IN AN E...,GB,Elevator/Escalator
2,"IV PASSENGER SUSTAINED INJURIES, OV AND IV COL...",AB,Sideswipe or lane change


In [13]:
df_claims = df_claims.drop(columns=['Coverage Code', 'Accident Source']).rename(columns={'Claim Description': 'claim_description'})

In [14]:
df_claims.to_csv('claims.csv', index=False)

## Anual reports

In [15]:
from bs4 import BeautifulSoup as soup
import requests as r
import pandas
import time
import os
import datetime
import random
import shutil
from string import punctuation