In [3]:
import difflib
from dataclasses import dataclass
from typing import Iterable, Optional

import numpy as np
import pandas as pd

@dataclass()
class FuzzyMerge:
    """
    Works like pandas merge except it also merges on approximate matches.

    Every value of ``right[right_on]`` is mapped to its closest match among the
    values of ``left[left_on]`` (via ``difflib.get_close_matches``). That match is
    stored on a copy of ``right`` under the ``left_on`` column name, and the copy
    is then merged with ``left`` on ``left_on``.

    Attributes:
        left: Frame whose ``left_on`` values are the match candidates.
        right: Frame whose ``right_on`` values are looked up among the candidates.
        left_on: Key column in ``left``; also the column the final merge joins on.
        right_on: Column in ``right`` holding the strings to be matched.
        how: Merge type forwarded to ``DataFrame.merge``.
        cutoff: Minimum similarity (0 to 1) forwarded to ``difflib``; candidates
            scoring below it are ignored.
    """
    left: pd.DataFrame
    right: pd.DataFrame
    left_on: str
    right_on: str
    how: str = "inner"
    cutoff: float = 0.3

    def main(self) -> pd.DataFrame:
        """
        Run the fuzzy merge and return the merged frame.

        ``self.right`` is not modified; the matched key is written to a copy. If
        ``right`` already has a column named ``left_on``, it is overwritten on
        that copy.

        NOTE(review): right rows without a match get a ``None`` key. pandas'
        merge matches null keys with each other, so such rows can pair with left
        rows whose ``left_on`` is null -- confirm whether that is acceptable.
        """
        # Score each distinct candidate only once: duplicates in the left column
        # cannot change which match is best, they only multiply the difflib work.
        candidates = list(
            dict.fromkeys(c for c in self.left[self.left_on] if isinstance(c, str))
        )

        # Repeated right-hand values reuse the first lookup.
        resolved = {}
        matched_keys = []
        for value in self.right[self.right_on]:
            if not isinstance(value, str):
                # Missing / non-text keys cannot be compared by difflib.
                matched_keys.append(None)
                continue
            if value not in resolved:
                resolved[value] = self.get_closest_match(value, candidates)
            matched_keys.append(resolved[value])

        temp = self.right.copy()
        temp[self.left_on] = matched_keys

        return self.left.merge(temp, on=self.left_on, how=self.how)

    def get_closest_match(self, left: str, right: Iterable[str]) -> Optional[str]:
        """
        Return the entry of ``right`` most similar to ``left``, or ``None``.

        Args:
            left: The string to look up.
            right: Candidate strings; non-string entries (e.g. NaN) are skipped
                because difflib cannot score them.

        Returns:
            The best-scoring candidate whose similarity is at least
            ``self.cutoff``; ``None`` if there is no such candidate or ``left``
            is not a string.
        """
        if not isinstance(left, str):
            return None

        candidates = [c for c in right if isinstance(c, str)]
        # Only the top match is used, so ask difflib for a single result.
        matches = difflib.get_close_matches(left, candidates, n=1, cutoff=self.cutoff)

        return matches[0] if matches else None
    
# Read both source tables, discard the "Unnamed: 0" column each CSV carries, and
# show the reviews frame as the cell output.
df_car = pd.read_csv("../datasets/cars/cars_dataset.csv", sep=",", header=0).drop(columns="Unnamed: 0")
df_reviews = pd.read_csv("../datasets/reviews/reviews_dataset.csv", sep=",", header=0).drop(columns="Unnamed: 0")
df_reviews

Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating,Make
0,on 01/17/17 10:49 AM (PST),Bob,2017 Acura RLX Sedan Technology Package 4dr Se...,Great Car,Traded in a 2012 Acura TL . The 2017 Acura R...,5.000,Acura
1,on 07/31/18 10:58 AM (PDT),David,2018 Acura RLX Hybrid Sport Hybrid SH-AWD 4dr ...,Liked the 2014 model and this is even better,Entertainment system may not be intuitive or ...,5.000,Acura
2,on 02/04/10 09:48 AM (PST),dan,2006 Acura RSX Hatchback 2dr Hatchback (2.0L 4...,No more acuras/hondas for me,I bought this car used only to have problems ...,3.000,Acura
3,on 08/19/08 12:11 PM (PDT),David,2006 Acura RSX Hatchback 2dr Hatchback (2.0L 4...,Great Sports Car,"This car is great, pretty fast for a 2.0L inl...",4.875,Acura
4,on 06/09/08 12:43 PM (PDT),Keith,2006 Acura RSX Hatchback 2dr Hatchback (2.0L 4...,Great first sports car,This car is great for a first sports car. It ...,4.875,Acura
...,...,...,...,...,...,...,...
29726,on 03/10/18 14:40 PM (PST),Jonathan,2009 Volvo C30 Hatchback T5 2dr Hatchback (2.5...,My 3rd Volvo,"Sporty, clean lines, well thought out and fun...",5.000,Volvo
29727,on 03/15/10 02:03 AM (PDT),D,2009 Volvo C30 Hatchback T5 2dr Hatchback (2.5...,Great little car,We bought a C30 on our return to the US after...,4.750,Volvo
29728,on 03/01/10 19:06 PM (PST),Paul,2009 Volvo C30 Hatchback T5 2dr Hatchback (2.5...,"Best in Class, Value and Otherwise","Strangely, this exquisite, sophisticated car ...",4.750,Volvo
29729,on 12/19/09 10:41 AM (PST),Terri,2009 Volvo C30 Hatchback T5 2dr Hatchback (2.5...,Just Right,I literally test drove 12 vehicles and kept c...,4.750,Volvo


In [4]:
# Attach car specifications to each review by fuzzy-matching the car table's
# "Vehicle_Title_Partial" strings against the reviews' "Vehicle_Title".
df_merged = FuzzyMerge(
    left=df_reviews,
    right=df_car,
    left_on="Vehicle_Title",
    right_on="Vehicle_Title_Partial",
).main()
# Disabled alternative, kept for reference: load a previously saved merge from disk.
# df_merged = pd.read_csv("datasets/merged_datasets.csv", sep=",", header=0)
# df_merged.drop("Unnamed: 0", axis=1, inplace=True)

In [1]:
# Display the fuzzy-merged frame.
# NOTE(review): the recorded NameError and the In [1] counter suggest this cell was
# last executed before df_merged was defined -- re-run it after the merge cell.
df_merged

NameError: name 'df_merged' is not defined

In [9]:
# Keep only rows where the review's make and the car table's make agree
# (case-insensitive) -- presumably to discard fuzzy title matches that crossed makes.
# Rows are filtered with a boolean mask rather than DataFrame.where: where() blanks
# the rejected rows to NaN, which upcasts int columns to float and bool columns to
# object, and lets a blanked row win drop_duplicates over a real row with a null Review.
same_make = df_merged["Make_x"].str.upper() == df_merged["Make_y"].str.upper()

df_merged_worked = (
    df_merged.loc[same_make]
    .drop(columns="Make_y")
    .rename(columns={"Make_x": "Make"})
    # A review can match several car rows; keep the first pairing per review text.
    .drop_duplicates(subset="Review", keep="first")
    .dropna(axis=0, how="all")
    .reset_index(drop=True)
)

In [10]:
# Cast the whole-number columns to int64, then print the frame's schema summary.
df_merged_worked = df_merged_worked.astype(
    dict.fromkeys(
        ["Year", "Engine Cylinders", "Number of Doors", "highway MPG", "city mpg", "Popularity", "MSRP"],
        "int64",
    )
)
df_merged_worked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Review_Date            1461 non-null   object 
 1   Author_Name            1461 non-null   object 
 2   Vehicle_Title          1461 non-null   object 
 3   Review_Title           1461 non-null   object 
 4   Review                 1461 non-null   object 
 5   Rating                 1461 non-null   float64
 6   Make                   1461 non-null   object 
 7   Model                  1461 non-null   object 
 8   Year                   1461 non-null   int64  
 9   Engine Fuel Type       1461 non-null   object 
 10  Engine HP              1445 non-null   float64
 11  Engine Cylinders       1461 non-null   int64  
 12  Transmission Type      1461 non-null   object 
 13  Driven_Wheels          1461 non-null   object 
 14  Number of Doors        1461 non-null   int64  
 15  Vehi

In [12]:
# Display the cleaned, make-consistent frame as the cell output.
df_merged_worked

Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating,Make,Model,Year,Engine Fuel Type,...,Common,Crossover,Exotic,Factory Tuner,Hatchback,High-Performance,Hybrid,Luxury,Performance,Vehicle_Title_Partial
0,on 07/28/10 07:30 AM (PDT),Bob,2010 Acura ZDX Hatchback Advance Package 4dr H...,This is truly a driver's car,I love the fact that with SH-AWD the more gas...,5.000,Acura,ZDX,2013,gasoline,...,False,True,False,False,True,False,False,True,False,2013 Acura ZDX 4dr Hatchback AWD (6cyl)
1,on 09/26/17 10:07 AM (PDT),Brian,2017 Acura RDX SUV 4dr SUV AWD (3.5L 6cyl 6A),Brake System Issue,A brake system issue started after 2 weeks of...,1.000,Acura,MDX,2017,gasoline,...,False,True,False,False,False,False,False,True,False,2017 Acura MDX 4dr SUV AWD (6cyl)
2,on 12/26/16 18:08 PM (PST),Scott,2017 Acura RDX SUV 4dr SUV AWD (3.5L 6cyl 6A),So far so good.,UPDATE 12/27/17 - after owning this car for 1...,4.000,Acura,MDX,2017,gasoline,...,False,True,False,False,False,False,False,True,False,2017 Acura MDX 4dr SUV AWD (6cyl)
3,on 10/18/17 09:25 AM (PDT),Alex,2018 Acura RDX SUV 4dr SUV AWD (3.5L 6cyl 6A),Nice whip!,"Very smooth driving, almost no road noise. Gr...",5.000,Acura,RDX,2016,gasoline,...,False,True,False,False,False,False,False,True,False,2016 Acura RDX 4dr SUV FWD (6cyl)
4,on 04/18/18 11:29 AM (PDT),Lori,2015 Acura RDX SUV 4dr SUV AWD (3.5L 6cyl 6A),Need a button?,There is no interior or exterior button to au...,3.000,Acura,MDX,2015,gasoline,...,False,True,False,False,False,False,False,True,False,2015 Acura MDX 4dr SUV AWD (6cyl)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,on 01/09/10 19:02 PM (PST),Judy,2010 Volvo XC90 SUV V8 4dr SUV AWD (4.4L 8cyl 6A),XC90 THE BEST OF THE REST,"I have owned a 2004, 2007, and now own a 2010...",4.875,Volvo,XC60,2015,gasoline,...,False,True,False,False,False,False,False,True,True,2015 Volvo XC60 4dr SUV FWD (4cyl)
1457,on 09/20/15 18:09 PM (PDT),scott,2015 Volvo XC60 SUV T6 4dr SUV AWD (3.0L 6cyl ...,UDATED Goodbye Lexus Rx350 and Hello Volvo XC60,"Our sticker starts with a T6 AWD, then adds t...",5.000,Volvo,XC60,2015,gasoline,...,False,True,False,False,False,False,False,True,True,2015 Volvo XC60 4dr SUV AWD (6cyl)
1458,on 08/22/15 17:39 PM (PDT),david,2016 Volvo XC60 SUV T5 4dr SUV AWD (2.5L 5cyl ...,Updated Review,Now at just under 45K Miles and just had insp...,5.000,Volvo,XC60,2016,gasoline,...,False,True,False,False,False,False,False,True,False,2016 Volvo XC60 4dr SUV AWD (5cyl)
1459,on 10/17/10 00:00 AM (PDT),Lauren,2011 Volvo C30 Hatchback T5 2dr Hatchback (2.5...,Forget Mini check out Volvo,"I thought I loved my 2007 Mini S convertible,...",4.875,Volvo,C30,2011,gasoline,...,False,False,False,False,True,False,False,True,False,2011 Volvo C30 2dr Hatchback FWD (5cyl)


In [13]:
# Persist the cleaned frame. The row index is written as well (to_csv default);
# presumably that is the "Unnamed: 0" column seen when such a file is read back.
df_merged_worked.to_csv(
    path_or_buf="../datasets/merged_datasets.csv",
    sep=",",
)