In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import os

import mysql.connector

In [None]:
# Connect to the local MySQL server as root. The password is taken from the
# MYSQL_PASSWORD environment variable, so it is never written in the notebook
# (it is None when the variable is not set).
mysql_password = os.getenv("MYSQL_PASSWORD")
connection_settings = {
    "host": "localhost",
    "user": "root",
    "password": mysql_password,
}
mydb = mysql.connector.connect(**connection_settings)

# dictionary=True: rows fetched through this cursor are dicts keyed by column name.
mycursor = mydb.cursor(dictionary=True)

# Select the `webscraper` schema for the queries issued later on this connection.
mycursor.execute("USE webscraper;")

In [None]:
# Count the listings per manufacturer. The LEFT JOIN keeps listings whose
# manufacturer_id has no match in `manufacturers`; those rows are grouped
# under a NULL manufacturer_name.
my_query = """
SELECT manufacturer_name, COUNT(*) as count
FROM (
    SELECT dataset.manufacturer_id, manufacturers.manufacturer_name
    FROM dataset
    LEFT JOIN manufacturers ON dataset.manufacturer_id = manufacturers.manufacturer_id
) AS joined_data
GROUP BY manufacturer_name
ORDER BY count DESC;
"""
mycursor.execute(my_query)
results = mycursor.fetchall()
# keep an independent list of the fetched rows (each row is indexed by column name below)
manufacturer_bar_data = list(results)

# Split the rows into parallel label/count arrays. A NULL name arrives as None,
# which matplotlib's categorical axis rejects (labels must be str or bytes),
# so such rows are shown under an explicit placeholder label instead.
manufacturer_names = np.array([
    entry['manufacturer_name'] if entry['manufacturer_name'] is not None else "Unknown"
    for entry in manufacturer_bar_data
])
counts = np.array([entry['count'] for entry in manufacturer_bar_data])

# plotting the bar chart
fig, ax = plt.subplots(figsize=(15, 7))
bars = ax.bar(manufacturer_names, counts)
# write each count centred just above the top of its bar
for bar, count in zip(bars, counts):
    ax.text(bar.get_x() + bar.get_width() / 2, count, str(count), ha='center', va='bottom')
ax.set_title("Distribution of manufacturers by volume")
ax.set_xlabel("Manufacturers")
ax.set_ylabel("Amount of listings")
# vertical tick labels so the manufacturer names do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Count the listings per model and keep the 25 most frequent ones. The LEFT
# JOIN keeps listings whose model_id has no match in `models`; those rows are
# grouped under a NULL model_name.
my_query = """
SELECT model_name, COUNT(*) as count
FROM (
    SELECT dataset.model_id, models.model_name
    FROM dataset
    LEFT JOIN models ON dataset.model_id = models.model_id
) AS joined_data
GROUP BY model_name
ORDER BY count DESC
LIMIT 25;
"""
mycursor.execute(my_query)
results = mycursor.fetchall()
# keep an independent list of the fetched rows (each row is indexed by column name below)
model_bar_data = list(results)

# Split the rows into parallel label/count arrays. A NULL name arrives as None,
# which matplotlib's categorical axis rejects (labels must be str or bytes),
# so such rows are shown under an explicit placeholder label instead.
model_names = np.array([
    entry['model_name'] if entry['model_name'] is not None else "Unknown"
    for entry in model_bar_data
])
counts = np.array([entry['count'] for entry in model_bar_data])

# plotting the bar chart
fig, ax = plt.subplots(figsize=(15, 7))
bars = ax.bar(model_names, counts)
# write each count centred just above the top of its bar
for bar, count in zip(bars, counts):
    ax.text(bar.get_x() + bar.get_width() / 2, count, str(count), ha='center', va='bottom')
ax.set_title("Top 25 models by volume")
ax.set_xlabel("Model name")
ax.set_ylabel("Amount of listings")
# vertical tick labels so the model names do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Distribution of fuel types for each manufacturer: one bar chart per manufacturer.
fuel_type_query = """
SELECT m.manufacturer_name, f.fuel_type, 
       count(*) as occurrences
       FROM webscraper.dataset d
			INNER JOIN manufacturers m ON m.manufacturer_id = d.manufacturer_id
			INNER JOIN fuel_types f ON f.fuel_id = d.fuel_type_id
group by d.manufacturer_id, fuel_type_id
order by manufacturer_name ASC, fuel_type ASC;
"""

mycursor.execute(fuel_type_query)
results = mycursor.fetchall()

# Seed one bucket per manufacturer from `manufacturer_names` (built by the
# manufacturer bar-chart cell), so the charts follow that order and a
# manufacturer without any fuel rows still gets an (empty) chart.
manufacturer_fuel_data = {manufacturer: {} for manufacturer in manufacturer_names}

for entry in results:
    # setdefault: a manufacturer that was not seeded above gets a bucket
    # instead of raising KeyError
    fuel_counts = manufacturer_fuel_data.setdefault(entry['manufacturer_name'], {})
    fuel_type = entry['fuel_type']
    # the query groups by ids, so the same (name, fuel type) pair may appear
    # in more than one row; sum those rows together
    fuel_counts[fuel_type] = fuel_counts.get(fuel_type, 0) + entry['occurrences']

for manufacturer, fuel_data in manufacturer_fuel_data.items():
    # materialise the dict views so matplotlib receives plain sequences
    fuel_types = list(fuel_data.keys())
    occurrences = list(fuel_data.values())

    x = range(len(fuel_types))

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(x, occurrences, tick_label=fuel_types)
    # without a title the per-manufacturer figures cannot be told apart
    ax.set_title(f"Fuel type distribution for {manufacturer}")
    ax.set_xlabel("Fuel type")
    ax.set_ylabel("Amount of listings")
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()
    # release the figure explicitly; this loop creates one per manufacturer
    plt.close(fig)