[Electric Vehicle Population Data](https://catalog.data.gov/dataset/electric-vehicle-population-data)<br>
Metadata Updated: July 20, 2024
<br><br>
This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through the Washington State Department of Licensing (DOL).

In [1]:
import copy
import os
import pickle
import time
from urllib.request import urlopen as uReq

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("all-MiniLM-L6-v2")
from sklearn.metrics.pairwise import cosine_similarity

from utils import *
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Tabular inputs: the tax-exemption eligibility list and the modified
# registration data set.
exempt_list = pd.read_csv(
    "../data/data.gov/WA_Tax_Exemptions_-_Potential_Eligibility_by_Make_Model_Excluding_Vehicle_Price_Criteria_20240730.csv"
)
data = pd.read_csv("../data/data.gov/modified/dataCAFV.csv")

# Previously pickled review texts.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# a review_dict.pkl that was produced locally.
with open("review_dict.pkl", mode="rb") as pkl_file:
    review_dict = pickle.load(pkl_file)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jungakim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def get_text_from_url(url):
    """Download a page and return its text with blank fragments removed.

    Parameters
    ----------
    url : str
        Address handed to ``urllib.request.urlopen`` (imported as ``uReq``).

    Returns
    -------
    str
        The text BeautifulSoup extracts from the page, split on line breaks
        and on double spaces, stripped, with empty fragments dropped and the
        remaining fragments joined by newline characters.

    Raises
    ------
    Whatever ``urlopen`` / ``read`` raise (e.g. ``urllib.error.URLError``);
    nothing is caught here.
    """
    # Using the response as a context manager guarantees the connection is
    # closed even when read() fails part-way through.
    with uReq(url) as uclient:
        page_html = uclient.read()

    soup = BeautifulSoup(page_html, "html.parser")
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    return '\n'.join(chunk for chunk in chunks if chunk)


In [3]:
# Rows whose "Electric Vehicle Type" ends with "(PHEV)".
# .str.endswith(..., na=False) treats a missing vehicle type as "not PHEV"
# instead of raising AttributeError the way x.endswith does on NaN.
# .copy() gives an independent frame, same as the previous copy.deepcopy.
phev = data.loc[data["Electric Vehicle Type"].str.endswith("(PHEV)", na=False)].copy()

# Per make: mean of CAFV_indicator and number of rows, ordered by both
# columns in descending order.
phev_summary = (
    phev.groupby("Make")
    .agg({"CAFV_indicator": "mean", "Model": "count"})
    .rename(columns={"CAFV_indicator": "proportion of CAFV", "Model": "count"})
    .sort_values(["proportion of CAFV", "count"], ascending=False)
)
phev_summary

Unnamed: 0_level_0,proportion of CAFV,count
Make,Unnamed: 1_level_1,Unnamed: 2_level_1
CHEVROLET,1.0,4833
CHRYSLER,1.0,3484
DODGE,1.0,631
LEXUS,1.0,347
CADILLAC,1.0,88
ALFA ROMEO,1.0,59
FISKER,1.0,14
WHEEGO ELECTRIC CARS,1.0,3
HONDA,0.988152,844
KIA,0.68417,2628


In [4]:
# Makes whose "proportion of CAFV" is neither exactly 0 nor exactly 1.
phev_partial = (
    phev_summary[~phev_summary["proportion of CAFV"].isin([0.0, 1.0])]
    .index
    .tolist()
)

In [5]:
# Spot-check one distinct Make / Model / Model Year combination (position 110)
# among the makes listed in phev_partial.
(
    phev.loc[phev["Make"].isin(phev_partial), ["Make", "Model", "Model Year"]]
    .drop_duplicates()
    .iloc[110]
)

Make            FORD
Model         ESCAPE
Model Year      2023
Name: 2578, dtype: object

In [6]:
# make, model, year = "ford", "escape", 2023
# url=f"https://www.kbb.com/{make}/{model}/{year}/"
# get_text_from_url(url)

In [7]:
# Number of distinct Make / Model / Model Year / Make-Model-Year combinations.
data[["Make", "Model", "Model Year", "Make-Model-Year"]].drop_duplicates().shape[0]

515

In [8]:
# Disabled scraping cell, kept for reference. When enabled it iterates over the
# distinct Make / Model / Model Year rows of `data`, builds a kbb.com URL for
# each, waits one second between requests, and stores the page text in
# `review_dict` under the row's "Make-Model-Year" key. A response starting with
# "Car Finder" (presumably a fallback page -- TODO confirm) triggers one retry
# without the year and is not stored if the retry starts the same way. The
# pickle dump at the bottom is what would write review_dict.pkl.
# # df =\
# # phev.loc[phev["Make"].isin(phev_partial), ["Make", "Model", "Model Year", "Make-Model-Year"]].drop_duplicates()

# df=\
# data.loc[:, ["Make", "Model", "Model Year", "Make-Model-Year"]].drop_duplicates()

# review_dict = {}

# for i, (index, row) in enumerate(df.iterrows(), start=1):

#     print(f"{i}", end='\t')
#     make, model, year = row["Make"].lower(), row["Model"].lower(), row["Model Year"]
#     make, model = "-".join(make.split()), "-".join(model.split())
#     url=f"https://www.kbb.com/"+"/".join([make, model, str(year)]) + "/"
#     time.sleep(1)
#     try:
#         response = get_text_from_url(url)
#         if response.startswith("Car Finder"):
#             make_model="-".join([row["Make"], row["Model"]])
#             response = get_text_from_url(f"https://www.kbb.com/"+"/".join([make, model]) + "/")
            
#         if not response.startswith("Car Finder"):
#             review_dict[row["Make-Model-Year"]] = response
#             print(f"{response[:25]}")
#     except Exception as e:
#         print(e)
        
# # with open('review_dict.pkl', 'wb') as fp:
# #     pickle.dump(review_dict, fp)

In [9]:
# Number of entries in review_dict (loaded from review_dict.pkl in the first cell).
len(review_dict)

419

In [10]:
# Marker string; each review text is cut off where this sentence first appears.
END_SENTENCE = "What did you think of this review?"

In [11]:
# Keep only the part of each review that precedes END_SENTENCE.
#
# str.partition returns the whole string unchanged when the marker is absent.
# The earlier find()-based slice used the -1 "not found" result as an end index
# and silently dropped the last character -- once more on every re-run of this
# cell. Only values of existing keys are replaced, so iterating over items()
# while assigning is safe.
for elm, review_text in review_dict.items():
    review_dict[elm] = review_text.partition(END_SENTENCE)[0]

In [12]:
# Starts empty; filled with the reviews whose key begins with a make from phev_partial.
phev_partial_dict = dict()

In [13]:
# Copy every review whose key starts with one of the makes in phev_partial.
# Makes are processed in phev_partial order, so insertion order is unchanged.
for make_prefix in phev_partial:
    phev_partial_dict.update(
        {
            review_key: review_text
            for review_key, review_text in review_dict.items()
            if review_key.startswith(make_prefix)
        }
    )

In [14]:
# Text-preprocessing step for topic modelling.
# NLPpipe and create_dictionary are not defined in this notebook; presumably
# they come from `from utils import *` -- TODO confirm.
nlp_pipe = NLPpipe()
# Fit on all review texts; the result is later passed as `corpus` to the LDA models.
# NOTE(review): min_count / threshold look like bigram-detection settings
# (the cell prints "Making bigrams...") -- confirm against utils.NLPpipe.
term_doc = nlp_pipe.fit_transform(pd.Series(review_dict), min_count = 3, threshold = -0.5)
# tf_idf = nlp_pipe.transform(reviews, tf_idf = True)
# Cleaned text kept on the pipeline object after fitting; used to build the dictionary.
data_lemmatized = nlp_pipe.clean_text
# Passed as `id2word` to the LDA models below.
dictionary = create_dictionary(data_lemmatized)

Making bigrams...
Lemmatizing...


### LDA-Mallet

In [15]:
# Number of topics requested from the MALLET LDA model below.
NUM_TOPICS = 3

In [16]:
# Train LDA through the external MALLET toolkit and export a pyLDAvis page.
MALLET_FOLDER = 'Mallet-202108'
# MALLET_FOLDER= 'Mallet-2.0.8RC1'
# MALLET_FOLDER= 'Mallet-2.0.8RC3'
# MALLET_FOLDER= 'Mallet-202108'

try:
    # The MALLET_PATH environment variable, when set, overrides the
    # machine-specific default so the cell can run on other machines.
    mallet_path = os.environ.get(
        'MALLET_PATH',
        f'/Users/jungakim/Desktop/Everything/CourseWork/analytics/{MALLET_FOLDER}/bin/mallet',
    )
    # Fail early with a readable message; otherwise the wrapper only reports
    # that the shell command returned exit status 127.
    if not os.path.isfile(mallet_path):
        raise FileNotFoundError(
            f"MALLET launcher not found at {mallet_path!r}; "
            "set the MALLET_PATH environment variable to <mallet dir>/bin/mallet"
        )

    # `alpha` is left at the wrapper default: LdaMallet forwards it to MALLET's
    # numeric --alpha option, so the string 'auto' (an LdaModel setting) is not
    # a valid value here.
    ldamallet = gensim.models.wrappers.LdaMallet(
        mallet_path,
        corpus=term_doc,
        num_topics=NUM_TOPICS,
        random_seed=100,
        id2word=dictionary,
    )

    # Visualize the topics
    model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
    pyLDAvis.enable_notebook()

    vis = pyLDAvis.gensim.prepare(model, term_doc, dictionary)
    pyLDAvis.save_html(vis, f'mallet-{NUM_TOPICS}.html')

except Exception as e:
    # Best effort: MALLET is an optional external tool, so report the problem
    # and let the rest of the notebook continue.
    print(f"Exception {e}")

Exception Command '/Users/jungakim/Desktop/Everything/CourseWork/analytics/Mallet-202108/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /var/folders/tg/r016s5t1663g7dr5vm02vbyr0000gn/T/6d2fb0_corpus.txt --output /var/folders/tg/r016s5t1663g7dr5vm02vbyr0000gn/T/6d2fb0_corpus.mallet' returned non-zero exit status 127.


### LDA-Standard

In [17]:
NUM_TOPICS = 3

# gensim's own LDA implementation, fitted on the same corpus and dictionary.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=term_doc,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=10,
    chunksize=2000,
    update_every=1,
    random_state=100,
    per_word_topics=True,
    # alpha= 'auto',
)

In [18]:
# Prepare the pyLDAvis data for the standard LDA model and save it as an HTML file.
vis_data = pyLDAvis.gensim.prepare(
    lda_model,
    term_doc,
    dictionary,
    sort_topics=False,
)
pyLDAvis.save_html(vis_data, f'ev-lda-num-topics-whole-{NUM_TOPICS}.html')

In [None]:
# nlp = NLPpipe()
# nlp.fit(pd.Series(review_dict))
# doc_term = nlp.transform(pd.Series(review_dict))
# doc_term

In [43]:
# url =\
# "https://ev-database.org/compare/newest-upcoming-electric-vehicle#sort:path~type~order=.id~number~desc|rs-price:prev~next=10000~100000|rs-range:prev~next=0~1000|rs-fastcharge:prev~next=0~1500|rs-acceleration:prev~next=2~23|rs-topspeed:prev~next=110~350|rs-battery:prev~next=10~200|rs-towweight:prev~next=0~2500|rs-eff:prev~next=100~350|rs-safety:prev~next=-1~5|paging:currentPage=0|paging:number=10"

# url=\
# "https://ev-database.org/car/2237/Hyundai-IONIQ-5-84-kWh-AWD"

# url=\
# "https://kbb.com/robots.txt"

# make, model, year = "volvo", "s60", "2023"

# url=\
# f"https://www.kbb.com/{make}/{model}/{year}/"

# uclient = uReq(url)

# page_html = uclient.read()
# uclient.close()

# soup=BeautifulSoup(page_html, "html.parser")
# text=soup.get_text()
# # break into lines and remove leading and trailing space on each
# lines = (line.strip() for line in text.splitlines())
# # break multi-headlines into a line each
# chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# # drop blank lines
# text = '\n'.join(chunk for chunk in chunks if chunk)
# chunks = text.split("\n")
# # print(text)