In [33]:
from pathlib import Path
import pandas as pd
import numpy as np
from string import punctuation

In [34]:
dataset_path = Path().absolute()

In [35]:
def read_dataset(file_path: Path) -> pd.DataFrame :

    return pd.read_csv(file_path, header = 0, names = ["username", "month_year", "post"])

In [36]:
edmunds = read_dataset(dataset_path / "edmunds_extraction.csv")

In [37]:
edmunds.head(5)

Unnamed: 0,username,month_year,post
0,plekto,April 2007,Of course the Buick LaCrosse qualifies. It ce...
1,punkr77,April 2007,One thing that drops the Maxima off my shoppi...
2,backy,April 2007,Good point. Another way Nissan differentiates...
3,lilengineerboy,April 2007,"Elroy,Autocross is typically a ""course"" confi..."
4,jeffyscott,April 2007,"Like I said earlier, if BMW can have the ride..."


In [38]:
edmunds.shape

(4899, 3)

In [39]:
models = pd.read_csv("models.csv", header = 0, names = ["brand", "model"])

In [40]:
models.head(5)

Unnamed: 0,brand,model
0,acura,Legend
1,acura,vigor
2,acura,rlx
3,acura,ILX
4,acura,MDX


In [41]:
# removing any leading or trailing punctuations from the brand name column
models['brand'] = models['brand'].apply(lambda x: x.strip(punctuation))

In [42]:
# getting all of the unique brand names into a list
distinct_brands: list[str] = list(models['brand'].unique())

In [43]:
popular_brands: list[str] = [
    "Toyota", "Volkswagen", "Ford", "Honda", "Chevrolet", "BMW", "Mercedes-Benz",
    "Nissan", "Audi", "Hyundai", "Lexus", "Kia", "Mazda", "Dodge", "Porsche",
    "Subaru", "Buick", "Cadillac", "Chrysler", "Ferrari", "GMC", "Infiniti",
    "Jaguar", "Jeep", "Land Rover", "Mitsubishi", "Acura", "Aston Martin", "Bentley",
    "Rolls Royce", "Alfa Romeo", "Citroen", "Fiat", "Lamborghini", "Lotus",
    "Maserati", "Mini", "Peugeot", "Renault", "Saab", "Suzuki", "Tesla", "Volvo",
    "Opel", "Skoda", "Seat", "Daihatsu", "Isuzu", "Smart", "Bugatti", "Koenigsegg",
    "Pagani", "Morgan", "TVR", "Lancia", "Genesis", "Rover", "Lincoln", "Scion",
    "Mahindra", "Tata", "Maruti Suzuki", "Proton", "Perodua", "Daewoo", "Zotye",
    "Chery", "Geely", "Dacia", "Holden", "Vauxhall", "SsangYong", "FAW", "BYD",
    "Great Wall", "Karmann", "Panhard", "Haval", "DS Automobiles", "Lada", "SEAT",
    "Ram", "Gumpert", "Spyker", "McLaren", "Hummer", "Pontiac", "Saturn", "Oldsmobile",
    "Plymouth", "DeLorean", "Studebaker", "Austin", "Triumph", "MG", "Hillman",
    "Willys", "Riley", "Simca", "Talbot"
]


In [44]:
popular_brands = [x.lower() for x in popular_brands]

In [45]:
# performing a frequency count of the brands


# changing all user posts to lowercase
edmunds['post'] = edmunds['post'].apply(lambda x : x.lower())

# store the brand mentions in a new pandas data frame
edmunds_brands: pd.DataFrame = pd.DataFrame(index = popular_brands, columns = ['mentions'])

edmunds_brands['mentions'] = 0

for brand in popular_brands:

    edmunds_brands.at[brand, 'mentions'] = edmunds['post'].str.contains(brand, na = False).sum()

In [47]:
top_10_brands = edmunds_brands.sort_values(by = 'mentions', ascending = False).head(10)

In [48]:
top_10_brands

Unnamed: 0,mentions
honda,941
ford,776
mazda,596
toyota,392
hyundai,233
seat,227
seat,227
nissan,227
ram,129
bmw,110
