# Text Analytics: Cars trends in North America

In [1]:
import requests
from bs4 import BeautifulSoup
import csv

1. Get all the cars brands from wikipedia

In [2]:
# Make a request to the website
urlA1 = "https://en.wikipedia.org/wiki/List_of_automobile_manufacturers"
response1 = requests.get(urlA1)

In [3]:
# Parse the HTML content
soup = BeautifulSoup(response1.content, 'html.parser')

In [4]:
# Find all elements with class "div-col"
el = soup.find_all("div", {"class": "div-col"}) 

Bacically, each "list" of companies under a country, or under 'current' or 'defunct' was of the class "div", more specifically "div-col". At this point of the code, it gives more a text of many companies names than an extensive list. We still need to play with this code.

In [5]:
# Extract the names of all car companies
car_companies = [e.text for e in el]
car_companies = ''.join(car_companies)
car_companies = car_companies.split('\n')
car_companies = [x for x in car_companies if x]
car_companies = sorted(car_companies) # sorting them in alphabetical order

In [6]:
# Print the list of car companies
print(car_companies)

['A.S.A.', 'A.T.S.', 'AAD', 'AC', 'ACE EV Group', 'AEC', 'AGA', 'AM General', 'AMC', 'AMZ', 'APIS', 'ARO', 'AVANI', 'AWS', 'AWZ', 'Abadal', 'Abarth', 'Acadian', 'Acura', 'Adam Motor Company', "Adam's Brothers", 'Adams-Farwell', 'Adler', 'Advanced Automotive Design', 'Aero', 'Agrale', 'Aion', 'Aixam', 'Ajanta Group', 'Ajlani Motors', 'Alco', 'Alesbury', 'Alexander Dennis', 'Alfa Romeo', 'All American Racers', 'Allard', 'Almazora Motors', 'Alpina', 'Alpine', 'Alta', 'Alvis', 'Amplex', 'Amur', 'Anadol (defunct)', 'Anasagasti', 'Ansaldo', 'Anteros Coachworks', 'Anziel', 'Apex Motors', 'Apex Motors', 'Apollo', 'Apollo', 'Apperson', 'Aptera', 'Aptera', 'Aquila', 'ArBenz', 'Arcfox', 'Arcimoto', 'Ariel', 'Arrinera', 'Artega', 'Asahi', 'Ashok Leyland', 'Asia MotorWorks', 'Asia Motors', 'Aspark', 'Aspid', 'Aston Martin', 'Asüna', 'Atalanta Motors', 'Attica', 'Atul Auto', 'Auburn', 'Audi', 'Aurea', 'Austin', 'Austin-Healey', 'Austro-Daimler', 'Austro-Tatra', 'Auto Union', 'Auto-Mixte', 'AutoLatin

In [7]:
# Write the list of car companies to a CSV file
with open('car_companies.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Car Companies'])
    for company in car_companies:
        writer.writerow([company])

Now, analyse how cars are being discussed in a random car forum discussion.

In [8]:
## Second script
import pandas as pd

In [9]:
posts=[]
for i in range(1, 200):
    urlA2 = "https://forums.edmunds.com/discussion/3941/general/x/i-spotted-a-new-insert-make-model-today"+"/p"+str(i)
    response2 = requests.get(urlA2)
    soup = BeautifulSoup(response2.text, 'html.parser')

    # Use Beautiful Soup methods to extract the data you need from the page
    comments = soup.find_all('div', {'class': 'Comment'})
    
    for comment in comments:
        userID = comment.find('span', {'class': 'Author'}).text.strip()
        date = comment.find('span',{"class" : "MItem DateCreated"}).text.strip()
        message = comment.find('div', {'class': 'Message userContent'}).text.strip()
        posts.append([userID,date,message])

In [10]:
# create a data frame
df = pd.DataFrame(posts, columns=['userID', 'date', 'message'])
# Save the data frame into a csv file
df.to_csv('forum_comments.csv', columns=['userID', 'date', 'message'], index=False)

In [11]:
#B
import pandas as pd
import re

In [12]:
# create an empty dictionary to store the frequency counts
counts = {}

In [13]:
# iterate through the list of manufacturers
for car in car_companies:
    # use the str.count() method to calculate the frequency of mentions
    count = sum(df['message'].apply(lambda x: len(re.findall(car,x))))
    # add the count to the dictionary
    counts[car] = count

In [14]:
# create a dataframe from the dictionary
df_counts = pd.DataFrame.from_dict(counts, orient='index', columns=['count'])

# sort the frequency counts in descending order
df_counts = df_counts.sort_values(by='count', ascending=False)

# select the top 5 manufacturers
top_5 = df_counts.head(5)

In [15]:
# print the top 5 manufacturers
print(top_5)

        count
BMW       439
Ford      404
Toyota    344
Lexus     337
Honda     334


In [16]:
#C
from itertools import combinations

In [17]:
# create an empty dictionary to store the co-mention counts
co_counts = {}

In [18]:
# iterate through all possible pairs of manufacturers
for car1, car2 in combinations(car_companies, 2):
    # use the str.count() method to calculate the co-mention count
    count = sum(df['message'].apply(lambda x: x.count(car1) * x.count(car2)))
    # add the co-mention count to the dictionary
    co_counts[(car1, car2)] = count

In [19]:
# create a dataframe from the dictionary
df_co_counts = pd.DataFrame.from_dict(co_counts, orient='index', columns=['count'])

In [20]:
# sort the co-mention counts in descending order
df_co_counts = df_co_counts.sort_values(by='count', ascending=False)

In [21]:
# select the top 3 brand pairs
top_3 = df_co_counts.head(3)

In [22]:
print(top_3)

                 count
(Audi, BMW)         86
(Acura, Honda)      82
(Honda, Toyota)     76


In [24]:
# Before moving forward to the next question, we are going to do the following data cleaning
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kenzasqalli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kenzasqalli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Tokenize the 'message' column
df['message'] = df['message'].apply(nltk.word_tokenize)

# Remove punctuation
df['message'] = df['message'].apply(lambda x: [word for word in x if word.isalnum()])

# Remove stop words
stop_words = set(stopwords.words('english'))
df['message'] = df['message'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

In [26]:
#D
## Deciding on attributes
from collections import Counter
words = []

for sublist in df['message']:
    words += sublist

In [27]:
word_counts = Counter(words)
df_word_counts = pd.DataFrame.from_dict(word_counts, orient='index', columns=['count'])
df_word_counts = df_word_counts.sort_values(by='count', ascending=False)
top_50 = df_word_counts.head(50)

In [28]:
# List of attributes:
attributes = ['new', 'old', 'black', 'white', 'big']
brands = ['BMW', 'Ford', 'Toyota', 'Lexus', 'Honda']

In [29]:
# Initialize a table with 0 values for all cells
table = pd.DataFrame(columns=attributes, index=brands, data=0)

In [30]:
# Iterate through each message in the data frame
for _, row in df.iterrows():
    # Check if the message contains any of the top 5 brands
    for brand in brands:
        if brand in row['message']:
            # Iterate through each attribute
            for attribute in attributes:
                if attribute in row['message']:
                    # Increment the value in the table for the corresponding brand and attribute
                    table.at[brand, attribute] += 1

In [31]:
# Add a new row at the end of the table named 'Total' to display which attribute is the most mentionned
table.loc['Total'] = table.sum()

# Display the final table
print(table)
# As expected, the attribute "new" is the most seen among the top 5 car brands.

        new  old  black  white  big
BMW     145   18     44     18   20
Ford    119   17     44     29   28
Toyota   71   23     21     18   24
Lexus   110   25     24     21   31
Honda    95   26     24     23   16
Total   540  109    157    109  119


In [32]:
# E
#Create a corpus of words expressing the desire to buy something
desire_to_buy_words = ['acquire', 'buy', 'choose', 'desire', 'get', 'hunt',
                       'invest', 'look', 'need', 'order', 'own', 'pick',
                       'purchase', 'search', 'select', 'shop', 'take', 'want',
                       'wish', 'bargain', 'buyout', 'cash', 'checkout', 'cost',
                       'credit', 'deal', 'demand', 'deposit', 'shop', 'spend',
                       'transact','transaction','budget','afford','crave',
                       'desperate','determine','determined','determinedly',
                       'elect','feasible','finalize','hankering','intend',
                       'intends','like','long','negotiate','opt','pay',
                       'prefer','priced','proceed','procurement','proposed',
                       'provide','scrutinize','settle','solution','spend',
                       'study','tendering','trade','transact']


In [33]:
# Create a dictionary to store the count of each brand
brand_count = {brand: 0 for brand in car_companies}

# Iterate over each row of the dataframe
for _, row in df.iterrows():
    # Get the message from the current row
    message = row['message']
    # Iterate over each word in the message
    for word in message:
        # Check if the word is in the desire_to_buy_words list
        if word in desire_to_buy_words:
            # Iterate over each brand in the car_companies list
            for brand in car_companies:
                # Check if the brand is in the message
                if brand in message:
                    # If it is, increment the count of the brand by 1
                    brand_count[brand] += 1

# Create a new dataframe to store the results
result_df = pd.DataFrame(columns=['Brand', 'Count'])

In [34]:
# Iterate over the brand_count dictionary and add the values to the result_df
for brand, count in brand_count.items():
    result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)

# Sort the dataframe by count in descending order
result_df = result_df.sort_values(by='Count', ascending=False)

# Print the dataframe
print(result_df)

  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_inde

           Brand Count
301         Ford   374
103          BMW   318
372        Honda   281
784       Toyota   279
579       Nissan   265
..           ...   ...
311          GAZ     0
312  GAZ (Volga)     0
313          GEM     0
314          GKD     0
868        Škoda     0

[869 rows x 2 columns]


  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_index=True)
  result_df = result_df.append({'Brand': brand, 'Count': count}, ignore_inde

##### Ford cars are the ones that are most desired to be acquired based on comments in this specific forum. Note that results may differ from a forum to another.

In [35]:
print(result_df)

           Brand Count
301         Ford   374
103          BMW   318
372        Honda   281
784       Toyota   279
579       Nissan   265
..           ...   ...
311          GAZ     0
312  GAZ (Volga)     0
313          GEM     0
314          GKD     0
868        Škoda     0

[869 rows x 2 columns]
