In [1]:
import pandas as pd

In [2]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

- punkt: The ‘punkt’ package is a pre-trained model that helps the library to split text into sentences and words efficiently.
- ‘stopwords’: This is the identifier for the stopwords corpus that you want to download. This corpus contains lists of stopwords for several languages.
- ‘wordnet’: This is the identifier for the WordNet lexical database that you want to download. It is a valuable resource for tasks such as semantic reasoning, natural language understanding, and language translation.

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaveh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaveh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kaveh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# import spacy_transformers
import preprocessor as p

In [5]:
import torch
import spacy
from spacy.util import minibatch, compounding
# import spacy_transformers
from spacy.tokens import Doc
from spacy.training import Example
from spacy.training.example import Example

In [6]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py  emoticons list
# https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt Chat shortcuts

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r' ', text)

def remove_punctuations(text):
    punctuations = re.compile(r'[~`!@#$%^&*(,<،>){}\\/|\'"?؟_+-=~\[\]]')
    return punctuations.sub(r' ', text)

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r' ', text)

def remove_weird_chars(text):
    weridPatterns = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               u"\u200c"
                               u"\u2068"
                               u"\u2067"
                               "]+", flags=re.UNICODE)
    patterns = [re.compile('\r'), re.compile('\n'), re.compile('&amp;')]
    text = weridPatterns.sub(r'', text)
    for p in patterns:
        text = p.sub(r' ', text)
    return text

def remove_extra_repeated_alpha(text):
    """
    Remove extra repeated alphabets in a word
    check these links:
    demo : https://regex101.com/r/ALxocA/1
    Question: https://bit.ly/2DoiPqS
    """
    return re.sub(r'([^\W\d_])\1{2,}', r'\1', text)

In [58]:
def first_step_cleaning(text):
    # text = text.encode('utf-8')
    # Remove extra spaces
    text = re.sub('\s+', ' ', str(text)).strip()
    # text = text.replace('-',' ')
    # Remove numbers
    # text = re.sub('\d+', '', text)
    # Case normalization
    text = text.lower()
    # text = p.clean(text).replace("Image","").replace("profile picture","").replace("  ","")
    # Remove URLs, HTML tags, and non-alphanumeric characters
    text = text.replace("\\xc2\\xa9","")
    text = text.replace("\n","")
    text = text.replace("..","")
    text = text.replace("     ","")
    text = text.replace("    ","")
    text = text.replace("   ","")
    text = text.replace("  ","")
    text = text.replace(" – "," ")
    text = text.replace("\x0c","")
    text = text.replace(r"\u0107","")
    text = text.replace(r"'","")
    text = text.replace(r'"','')
    text = text.replace(rem_01,"")
    text = text.replace(rem_02,"")
    text = text.replace(rem_03,"")
    for i in range(50):
        text = text.replace("\n{}".format(i),"")
    for i in range(500):
        text = text.replace(r"\u0{}".format(i),"")
    # text = re.sub(r'http\S+', '', single_string)
    # text = re.sub('<[^>]*>', '', text)
    # text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_weird_chars(text)
    text = remove_extra_repeated_alpha(text)
    # Remove punctuations
    # text = "".join([char for char in text if char not in string.punctuation])
    return text

In [8]:
def second_step_cleaning_tokenizing(text):
    # Remove all punctuation except the ones to keep
    translator = str.maketrans('', '', string.punctuation.replace('/', '').replace('-',''))
    text = text.translate(translator)
    # Tokenization
    tokens = word_tokenize(text)

    # Stopword removal
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = SnowballStemmer("english")
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Lemmatization
    wnl = WordNetLemmatizer()
    lemmatized_tokens = [wnl.lemmatize(word) for word in stemmed_tokens]

    return lemmatized_tokens

In [9]:
raw_df = pd.read_csv(r"Mohammad_Karimadini_Crawling.csv")
# raw_df.drop(columns=["Unnamed: 0"],inplace = True)
raw_df

Unnamed: 0,"12. Do we need different measures to ensure all segments of the road transport sector are able to reduce emissions, and if so what government and industry measures might well support the uptake of electric bikes, micro-mobility and motorbikes?",8. Would vehicle fuel efficiency standards incentivise global manufacturers to send EVs and lower emission vehicles to Australia?,3. What are suitable indicators to measure if we are on track to achieve our goals and objectives?,15. What actions can governments and industry take to strengthen our competitiveness and innovate across the full lifecycle of the EV value chain?,16. How can we expand our existing domestic heavy vehicle manufacturing and assembly capability?,1. Do you agree with the objectives and do you think they will achieve our proposed goals? Are there other objectives we should consider?,6. What information could help increase demand and is Government or industry best placed to inform Australians about EVs?,7. Are vehicle fuel efficiency standards an effective mechanism to reduce passenger and light commercial fleet emissions?,What area best describes where you live?,18. Are there other proposals that could help drive demand for EVs and provide a revenue source to help fund road infrastructure?,...,What state or territory do you live in?,5. Over what timeframe should we be incentivising low emission vehicles as we transition to zero emission vehicles?,17. Is it viable to extend Australian domestic manufacturing and assembly capability to other vehicle classes?,"10. What design features should the Government consider in more detail for vehicle fuel efficiency standards, including level of ambition, who they should apply to, commencement date, penalties and enforcement?",20. How can we best make sure all Australians get access to the opportunities and benefits from the transition?,Make a general comment,"9. In addition to vehicle fuel efficiency standards for passenger and light commercial vehicles, would vehicle fuel efficiency standards be an appropriate mechanism to increase the supply of heavy vehicle classes to Australia?",Titles,number_of_q_answered,links_crawled_listed
0,"Yes, please.\nPlease follow the Climate Counci...",,Rapidly reduce sales of ICE vehicles\nRapidly ...,,,"I agree with the current objectives, however s...",The Government should be informing Australians...,"Yes, as shown in the EU etc. Please introduce ...",City,,...,New South Wales,Please do not incentivise hybrids or plug-in h...,,"World-best ambition, as soon as possible.","Make riding a bike safe and accessible, as thi...",I took two kids in a pram on a bus to a street...,,#460\nAnonymous,14,https://consult.dcceew.gov.au/national-electri...
1,,,,,,,"Government. ""Industry"", (the local distributor...",,City,,...,New South Wales,,,,,If you want more EVs in Australia allow free t...,,#456\nAnonymous,21,https://consult.dcceew.gov.au/national-electri...
2,,,,,,,,,City,,...,Victoria,,,,,,,#449\nAnonymous,23,https://consult.dcceew.gov.au/national-electri...
3,Different measures are no doubt needed to supp...,The evidence suggets that they do encourage this.,Setting a target year for banning ICE vehicle ...,Supporting the full lifecyle of battery manufa...,Priotising locally made heavy vehicle and bus ...,"It's unclear whether ""establishing systems and...",Government and industry both have a role to pl...,Indications are that fuel efficiency standards...,City,Fuel excise is not currently utilised for road...,...,New South Wales,"Only BEVs should be incentivised, the rest are...",Absolutely and it is already being done for ma...,Government should consider best practice stand...,Structure incentives appropriately and look at...,Australia is ready for bold and strong commitm...,Fuel efficiency standards would no doubt incre...,#438\nAnonymous,46,https://consult.dcceew.gov.au/national-electri...
4,,Fuel should no longer be relevant in these dis...,"Uptake of EVs, increase of charge station avai...",Support shipping vehicles to the country,,We should not tie ourselves to Australian manu...,A similar body to the AER may be needed to assist,No,City,,...,Victoria,10 year timeframe to allow for infrastructure,,,,People in high density housing have no chance ...,,#432\nAnonymous,59,https://consult.dcceew.gov.au/national-electri...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,,,,,,,,,Regional area,,...,New South Wales,,,,,,,#409\nWilliam Adlong,4833,https://consult.dcceew.gov.au/national-electri...
443,,,,,,,,,Regional area,,...,New South Wales,,,,,,,#301\nWinZero Inc,4835,https://consult.dcceew.gov.au/national-electri...
444,,,,,,,,,Regional area,,...,New South Wales,,,,,The attachment is in “Pages” (IOS platform) wh...,,#312\nWodonga Albury Towards Climate Health (W...,4838,https://consult.dcceew.gov.au/national-electri...
445,,,,,,,,,City,,...,New South Wales,,,,,,,#436\nWoolworths Group,4840,https://consult.dcceew.gov.au/national-electri...


In [23]:
keep_rows_bool = []
for i in range(len(raw_df)):
    # combine all values of row 0 to one string
    # row_str = ''.join(raw_df.iloc[i,:-3].dropna().astype(str)).replace("\n","")
    keep_rows_bool.append(raw_df.iloc[i,:-3].isna().sum() < 19)

In [25]:
raw_df["keep_rows_bool"] = keep_rows_bool
raw_df

Unnamed: 0,"12. Do we need different measures to ensure all segments of the road transport sector are able to reduce emissions, and if so what government and industry measures might well support the uptake of electric bikes, micro-mobility and motorbikes?",8. Would vehicle fuel efficiency standards incentivise global manufacturers to send EVs and lower emission vehicles to Australia?,3. What are suitable indicators to measure if we are on track to achieve our goals and objectives?,15. What actions can governments and industry take to strengthen our competitiveness and innovate across the full lifecycle of the EV value chain?,16. How can we expand our existing domestic heavy vehicle manufacturing and assembly capability?,1. Do you agree with the objectives and do you think they will achieve our proposed goals? Are there other objectives we should consider?,6. What information could help increase demand and is Government or industry best placed to inform Australians about EVs?,7. Are vehicle fuel efficiency standards an effective mechanism to reduce passenger and light commercial fleet emissions?,What area best describes where you live?,18. Are there other proposals that could help drive demand for EVs and provide a revenue source to help fund road infrastructure?,...,5. Over what timeframe should we be incentivising low emission vehicles as we transition to zero emission vehicles?,17. Is it viable to extend Australian domestic manufacturing and assembly capability to other vehicle classes?,"10. What design features should the Government consider in more detail for vehicle fuel efficiency standards, including level of ambition, who they should apply to, commencement date, penalties and enforcement?",20. How can we best make sure all Australians get access to the opportunities and benefits from the transition?,Make a general comment,"9. In addition to vehicle fuel efficiency standards for passenger and light commercial vehicles, would vehicle fuel efficiency standards be an appropriate mechanism to increase the supply of heavy vehicle classes to Australia?",Titles,number_of_q_answered,links_crawled_listed,keep_rows_bool
0,"Yes, please.\nPlease follow the Climate Counci...",,Rapidly reduce sales of ICE vehicles\nRapidly ...,,,"I agree with the current objectives, however s...",The Government should be informing Australians...,"Yes, as shown in the EU etc. Please introduce ...",City,,...,Please do not incentivise hybrids or plug-in h...,,"World-best ambition, as soon as possible.","Make riding a bike safe and accessible, as thi...",I took two kids in a pram on a bus to a street...,,#460\nAnonymous,14,https://consult.dcceew.gov.au/national-electri...,True
1,,,,,,,"Government. ""Industry"", (the local distributor...",,City,,...,,,,,If you want more EVs in Australia allow free t...,,#456\nAnonymous,21,https://consult.dcceew.gov.au/national-electri...,True
2,,,,,,,,,City,,...,,,,,,,#449\nAnonymous,23,https://consult.dcceew.gov.au/national-electri...,False
3,Different measures are no doubt needed to supp...,The evidence suggets that they do encourage this.,Setting a target year for banning ICE vehicle ...,Supporting the full lifecyle of battery manufa...,Priotising locally made heavy vehicle and bus ...,"It's unclear whether ""establishing systems and...",Government and industry both have a role to pl...,Indications are that fuel efficiency standards...,City,Fuel excise is not currently utilised for road...,...,"Only BEVs should be incentivised, the rest are...",Absolutely and it is already being done for ma...,Government should consider best practice stand...,Structure incentives appropriately and look at...,Australia is ready for bold and strong commitm...,Fuel efficiency standards would no doubt incre...,#438\nAnonymous,46,https://consult.dcceew.gov.au/national-electri...,True
4,,Fuel should no longer be relevant in these dis...,"Uptake of EVs, increase of charge station avai...",Support shipping vehicles to the country,,We should not tie ourselves to Australian manu...,A similar body to the AER may be needed to assist,No,City,,...,10 year timeframe to allow for infrastructure,,,,People in high density housing have no chance ...,,#432\nAnonymous,59,https://consult.dcceew.gov.au/national-electri...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,,,,,,,,,Regional area,,...,,,,,,,#409\nWilliam Adlong,4833,https://consult.dcceew.gov.au/national-electri...,False
443,,,,,,,,,Regional area,,...,,,,,,,#301\nWinZero Inc,4835,https://consult.dcceew.gov.au/national-electri...,False
444,,,,,,,,,Regional area,,...,,,,,The attachment is in “Pages” (IOS platform) wh...,,#312\nWodonga Albury Towards Climate Health (W...,4838,https://consult.dcceew.gov.au/national-electri...,False
445,,,,,,,,,City,,...,,,,,,,#436\nWoolworths Group,4840,https://consult.dcceew.gov.au/national-electri...,False


In [27]:
raw_df.to_csv("CheckPoint_01_MK_P_01.csv",index=False)

# CheckPoint_01

In [28]:
raw_df = raw_df[raw_df["keep_rows_bool"]]

In [29]:
len(raw_df)

241

In [30]:
raw_df.columns

Index(['12. Do we need different measures to ensure all segments of the road transport sector are able to reduce emissions, and if so what government and industry measures might well support the uptake of electric bikes, micro-mobility and motorbikes?',
       '8. Would vehicle fuel efficiency standards incentivise global manufacturers to send EVs and lower emission vehicles to Australia?',
       '3. What are suitable indicators to measure if we are on track to achieve our goals and objectives?',
       '15. What actions can governments and industry take to strengthen our competitiveness and innovate across the full lifecycle of the EV value chain?',
       '16. How can we expand our existing domestic heavy vehicle manufacturing and assembly capability?',
       '1. Do you agree with the objectives and do you think they will achieve our proposed goals? Are there other objectives we should consider?',
       '6. What information could help increase demand and is Government or industry 

In [31]:
combined_values = []
for i in range(len(raw_df)):
    # combine all values of row 0 to one string
    row_str = ''.join(raw_df.iloc[i,:-4].dropna().astype(str)).replace("\n","")
    combined_values.append(row_str)

In [34]:
len(combined_values)

241

In [35]:
combined_values[4]

"Yes, fuel efficiency standards would send an important market signal to manufacturers. They would need to be introduced fairly quickly to avoid dumping of cheaper, less efficient vehicles before their importation was banned. There might need to be a tax on large inventories of less fuel efficient vehicles.- Increasing the number and proportion of new and second-hand EV registrations overall- The availability and takeup of more electricity-efficient EVs for people who don't need high-performance vehicles.- The proportion of workplaces and commuter carparks which offer EV charging facilities- The amount of otherwise unused electricity from rooftop solar being used instead to charge EVs during daylight hours, and the takeup of bidirectional charging to improve grid capacity during peak periods.Provide TAFE places to train mechanics in the maintenance and repair of EVs, tax incentives for domestic EV vehicle and component manufacturing, and for the repurposing of batteries no longer usabl

In [36]:
raw_df["combined_values"] = combined_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_df["combined_values"] = combined_values


In [38]:
raw_df

Unnamed: 0,"12. Do we need different measures to ensure all segments of the road transport sector are able to reduce emissions, and if so what government and industry measures might well support the uptake of electric bikes, micro-mobility and motorbikes?",8. Would vehicle fuel efficiency standards incentivise global manufacturers to send EVs and lower emission vehicles to Australia?,3. What are suitable indicators to measure if we are on track to achieve our goals and objectives?,15. What actions can governments and industry take to strengthen our competitiveness and innovate across the full lifecycle of the EV value chain?,16. How can we expand our existing domestic heavy vehicle manufacturing and assembly capability?,1. Do you agree with the objectives and do you think they will achieve our proposed goals? Are there other objectives we should consider?,6. What information could help increase demand and is Government or industry best placed to inform Australians about EVs?,7. Are vehicle fuel efficiency standards an effective mechanism to reduce passenger and light commercial fleet emissions?,What area best describes where you live?,18. Are there other proposals that could help drive demand for EVs and provide a revenue source to help fund road infrastructure?,...,17. Is it viable to extend Australian domestic manufacturing and assembly capability to other vehicle classes?,"10. What design features should the Government consider in more detail for vehicle fuel efficiency standards, including level of ambition, who they should apply to, commencement date, penalties and enforcement?",20. How can we best make sure all Australians get access to the opportunities and benefits from the transition?,Make a general comment,"9. In addition to vehicle fuel efficiency standards for passenger and light commercial vehicles, would vehicle fuel efficiency standards be an appropriate mechanism to increase the supply of heavy vehicle classes to Australia?",Titles,number_of_q_answered,links_crawled_listed,keep_rows_bool,combined_values
0,"Yes, please.\nPlease follow the Climate Counci...",,Rapidly reduce sales of ICE vehicles\nRapidly ...,,,"I agree with the current objectives, however s...",The Government should be informing Australians...,"Yes, as shown in the EU etc. Please introduce ...",City,,...,,"World-best ambition, as soon as possible.","Make riding a bike safe and accessible, as thi...",I took two kids in a pram on a bus to a street...,,#460\nAnonymous,14,https://consult.dcceew.gov.au/national-electri...,True,"Yes, please.Please follow the Climate Council ..."
1,,,,,,,"Government. ""Industry"", (the local distributor...",,City,,...,,,,If you want more EVs in Australia allow free t...,,#456\nAnonymous,21,https://consult.dcceew.gov.au/national-electri...,True,"Government. ""Industry"", (the local distributor..."
3,Different measures are no doubt needed to supp...,The evidence suggets that they do encourage this.,Setting a target year for banning ICE vehicle ...,Supporting the full lifecyle of battery manufa...,Priotising locally made heavy vehicle and bus ...,"It's unclear whether ""establishing systems and...",Government and industry both have a role to pl...,Indications are that fuel efficiency standards...,City,Fuel excise is not currently utilised for road...,...,Absolutely and it is already being done for ma...,Government should consider best practice stand...,Structure incentives appropriately and look at...,Australia is ready for bold and strong commitm...,Fuel efficiency standards would no doubt incre...,#438\nAnonymous,46,https://consult.dcceew.gov.au/national-electri...,True,Different measures are no doubt needed to supp...
4,,Fuel should no longer be relevant in these dis...,"Uptake of EVs, increase of charge station avai...",Support shipping vehicles to the country,,We should not tie ourselves to Australian manu...,A similar body to the AER may be needed to assist,No,City,,...,,,,People in high density housing have no chance ...,,#432\nAnonymous,59,https://consult.dcceew.gov.au/national-electri...,True,Fuel should no longer be relevant in these dis...
5,,"Yes, fuel efficiency standards would send an i...",- Increasing the number and proportion of new ...,Provide TAFE places to train mechanics in the ...,,The objectives should be broadened to include ...,There is a role for both government and indust...,"Yes, they are important in reducing emissions,...",City,Distance-based road user charges for all vehic...,...,,Adoption of European standards would be approp...,,Thank you for the opportunity to make a submis...,Fuel efficiency standards for heavy vehicles a...,#352\nAnonymous,77,https://consult.dcceew.gov.au/national-electri...,True,"Yes, fuel efficiency standards would send an i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,City,See answers supplied by email and in the attac...,...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,,See answers supplied by email and in the attac...,#466\nTim O'Loughlin,4734,https://consult.dcceew.gov.au/national-electri...,True,See answers supplied by email and in the attac...
433,,,Please see my response,,,Please see my response,,,City,,...,,,,,,#435\nVolgren Australia,4765,https://consult.dcceew.gov.au/national-electri...,True,Please see my responsePlease see my responseCi...
436,It is important to focus on the main polluters...,yes,percentage of EVs sold in comparison to ICE ve...,,,Yes and yes.\nIncreasing fuel independence sho...,Government to provide statistics on EV uptake ...,yes,City,The earlier proposed RUC policy is ideally sui...,...,,,,The comments made in the submission may be rep...,,#113\nWatts4U,4786,https://consult.dcceew.gov.au/national-electri...,True,It is important to focus on the main polluters...
439,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,City,Please refer to the attached document for our ...,...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,,Please refer to the attached document for our ...,#455\nWestern Sydney Regional Organisation of ...,4812,https://consult.dcceew.gov.au/national-electri...,True,Please refer to the attached document for our ...


In [39]:
raw_df.to_csv("CheckPoint_02_MK_P_01.csv",index=False)

# Checkpoint_02
## step 1 of cleaning

In [40]:
combined_df = pd.read_csv("CheckPoint_02_MK_P_01.csv")
combined_df

Unnamed: 0,"12. Do we need different measures to ensure all segments of the road transport sector are able to reduce emissions, and if so what government and industry measures might well support the uptake of electric bikes, micro-mobility and motorbikes?",8. Would vehicle fuel efficiency standards incentivise global manufacturers to send EVs and lower emission vehicles to Australia?,3. What are suitable indicators to measure if we are on track to achieve our goals and objectives?,15. What actions can governments and industry take to strengthen our competitiveness and innovate across the full lifecycle of the EV value chain?,16. How can we expand our existing domestic heavy vehicle manufacturing and assembly capability?,1. Do you agree with the objectives and do you think they will achieve our proposed goals? Are there other objectives we should consider?,6. What information could help increase demand and is Government or industry best placed to inform Australians about EVs?,7. Are vehicle fuel efficiency standards an effective mechanism to reduce passenger and light commercial fleet emissions?,What area best describes where you live?,18. Are there other proposals that could help drive demand for EVs and provide a revenue source to help fund road infrastructure?,...,17. Is it viable to extend Australian domestic manufacturing and assembly capability to other vehicle classes?,"10. What design features should the Government consider in more detail for vehicle fuel efficiency standards, including level of ambition, who they should apply to, commencement date, penalties and enforcement?",20. How can we best make sure all Australians get access to the opportunities and benefits from the transition?,Make a general comment,"9. In addition to vehicle fuel efficiency standards for passenger and light commercial vehicles, would vehicle fuel efficiency standards be an appropriate mechanism to increase the supply of heavy vehicle classes to Australia?",Titles,number_of_q_answered,links_crawled_listed,keep_rows_bool,combined_values
0,"Yes, please.\nPlease follow the Climate Counci...",,Rapidly reduce sales of ICE vehicles\nRapidly ...,,,"I agree with the current objectives, however s...",The Government should be informing Australians...,"Yes, as shown in the EU etc. Please introduce ...",City,,...,,"World-best ambition, as soon as possible.","Make riding a bike safe and accessible, as thi...",I took two kids in a pram on a bus to a street...,,#460\nAnonymous,14,https://consult.dcceew.gov.au/national-electri...,True,"Yes, please.Please follow the Climate Council ..."
1,,,,,,,"Government. ""Industry"", (the local distributor...",,City,,...,,,,If you want more EVs in Australia allow free t...,,#456\nAnonymous,21,https://consult.dcceew.gov.au/national-electri...,True,"Government. ""Industry"", (the local distributor..."
2,Different measures are no doubt needed to supp...,The evidence suggets that they do encourage this.,Setting a target year for banning ICE vehicle ...,Supporting the full lifecyle of battery manufa...,Priotising locally made heavy vehicle and bus ...,"It's unclear whether ""establishing systems and...",Government and industry both have a role to pl...,Indications are that fuel efficiency standards...,City,Fuel excise is not currently utilised for road...,...,Absolutely and it is already being done for ma...,Government should consider best practice stand...,Structure incentives appropriately and look at...,Australia is ready for bold and strong commitm...,Fuel efficiency standards would no doubt incre...,#438\nAnonymous,46,https://consult.dcceew.gov.au/national-electri...,True,Different measures are no doubt needed to supp...
3,,Fuel should no longer be relevant in these dis...,"Uptake of EVs, increase of charge station avai...",Support shipping vehicles to the country,,We should not tie ourselves to Australian manu...,A similar body to the AER may be needed to assist,No,City,,...,,,,People in high density housing have no chance ...,,#432\nAnonymous,59,https://consult.dcceew.gov.au/national-electri...,True,Fuel should no longer be relevant in these dis...
4,,"Yes, fuel efficiency standards would send an i...",- Increasing the number and proportion of new ...,Provide TAFE places to train mechanics in the ...,,The objectives should be broadened to include ...,There is a role for both government and indust...,"Yes, they are important in reducing emissions,...",City,Distance-based road user charges for all vehic...,...,,Adoption of European standards would be approp...,,Thank you for the opportunity to make a submis...,Fuel efficiency standards for heavy vehicles a...,#352\nAnonymous,77,https://consult.dcceew.gov.au/national-electri...,True,"Yes, fuel efficiency standards would send an i..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,City,See answers supplied by email and in the attac...,...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,See answers supplied by email and in the attac...,,See answers supplied by email and in the attac...,#466\nTim O'Loughlin,4734,https://consult.dcceew.gov.au/national-electri...,True,See answers supplied by email and in the attac...
237,,,Please see my response,,,Please see my response,,,City,,...,,,,,,#435\nVolgren Australia,4765,https://consult.dcceew.gov.au/national-electri...,True,Please see my responsePlease see my responseCi...
238,It is important to focus on the main polluters...,yes,percentage of EVs sold in comparison to ICE ve...,,,Yes and yes.\nIncreasing fuel independence sho...,Government to provide statistics on EV uptake ...,yes,City,The earlier proposed RUC policy is ideally sui...,...,,,,The comments made in the submission may be rep...,,#113\nWatts4U,4786,https://consult.dcceew.gov.au/national-electri...,True,It is important to focus on the main polluters...
239,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,City,Please refer to the attached document for our ...,...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,Please refer to the attached document for our ...,,Please refer to the attached document for our ...,#455\nWestern Sydney Regional Organisation of ...,4812,https://consult.dcceew.gov.au/national-electri...,True,Please refer to the attached document for our ...


In [41]:
M_clean_D = combined_df.loc[:,['combined_values']]

In [42]:
M_clean_D.isna().sum()

combined_values    0
dtype: int64

In [43]:
M_clean_D.isna()

Unnamed: 0,combined_values
0,False
1,False
2,False
3,False
4,False
...,...
236,False
237,False
238,False
239,False


In [44]:
M_clean_D.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   combined_values  241 non-null    object
dtypes: object(1)
memory usage: 2.0+ KB


In [45]:
M_clean_D.iloc[0,0]

'Yes, please.Please follow the Climate Council policy for Sustainable Transport (noting this is aimed at the states and territories): https://www.climatecouncil.org.au/sustainable-transport-policies-states/Reallocate funding from highways and motorways to active and public transport.Fund safe, separated bike lane networks so people on electric bikes and scooters are not a danger to people walking.If possible, encourage the States and Territories to roll out enforced 30 km/h zones around schools, playgrounds, high streets, and other places where people want to walk or ride bikes.Help Australians live healthier lives by making walking, riding a bike, and taking the bus or train an easy and safe way to get around.Rapidly reduce sales of ICE vehiclesRapidly lower whole-of-sector transport emissions, in line with 1.5 degrees C, and with interim targetsIncrease modal share of bikes, other micromobility devices, buses, and trainsReallocate transport budget to active and public transport proje

In [59]:
cleaned_data_01 = M_clean_D.map(first_step_cleaning)

In [60]:
cleaned_data_01.iloc[0,0]

'yes, please.please follow the climate council policy for sustainable transport (noting this is aimed at the states and territories):   funding from highways and motorways to active and public transport.fund safe, separated bike lane networks so people on electric bikes and scooters are not a danger to people walking.if possible, encourage the states and territories to roll out enforced 30 km/h zones around schools, playgrounds, high streets, and other places where people want to walk or ride bikes.help australians live healthier lives by making walking, riding a bike, and taking the bus or train an easy and safe way to get around.rapidly reduce sales of ice vehiclesrapidly lower whole-of-sector transport emissions, in line with 1.5 degrees c, and with interim targetsincrease modal share of bikes, other micromobility devices, buses, and trainsreallocate transport budget to active and public transport projects rather than motorways and roads that make it more difficult to walk and ride 

In [54]:
# # for finding the very special characters
# rem_01 = cleaned_data_01.iloc[0,0][-3:-1]
# rem_02 = cleaned_data_01.iloc[0,0][-3:-2]
# rem_03 = cleaned_data_01.iloc[0,0][-167:-166]

In [57]:
# rem_03

'“'

In [61]:
cleaned_data_01.head()

Unnamed: 0,combined_values
0,"yes, please.please follow the climate council ..."
1,"government. industry, (the local distributors ..."
2,different measures are no doubt needed to supp...
3,fuel should no longer be relevant in these dis...
4,"yes, fuel efficiency standards would send an i..."


In [62]:
combined_df['cleaned_combined_text'] = cleaned_data_01['combined_values']

In [63]:
combined_df.iloc[:,-1]

0      yes, please.please follow the climate council ...
1      government. industry, (the local distributors ...
2      different measures are no doubt needed to supp...
3      fuel should no longer be relevant in these dis...
4      yes, fuel efficiency standards would send an i...
                             ...                        
236    see answers supplied by email and in the attac...
237    please see my responseplease see my responseci...
238    it is important to focus on the main polluters...
239    please refer to the attached document for our ...
240    1. provide free registration and accident insu...
Name: cleaned_combined_text, Length: 241, dtype: object

In [64]:
combined_df.to_csv('CheckPoint_03_MK_P_01.csv',index=False)

# Check_Point_3

# Start from here after Semi Cleaning

In [65]:
Loaded_semi_cleaned_data = pd.read_csv('CheckPoint_03_MK_P_01.csv')

In [66]:
spacy.prefer_gpu()

True

In [67]:
# Load the pre-trained spaCy model
# en_core_web_trf
# en_core_web_sm
# nlp = spacy.load('en_core_web_trf')
if spacy.prefer_gpu():
    # load the en_core_web_trf model on GPU
    nlp = spacy.load("en_core_web_trf")

In [69]:
# Loaded_semi_cleaned_data.iloc[0,-1][13:]

In [70]:
nlp

<spacy.lang.en.English at 0x16c07589ad0>

# must be done multiple times with different slices and must be appended to text and labels, but after save and load cause, remember you have to restart kernel each time.

# Run bellow only if max is 6000

In [244]:
len(Loaded_semi_cleaned_data.iloc[:,-1]),Loaded_semi_cleaned_data.iloc[:,-1]

(241,
 0      yes, please.please follow the climate council ...
 1      government. industry, (the local distributors ...
 2      different measures are no doubt needed to supp...
 3      fuel should no longer be relevant in these dis...
 4      yes, fuel efficiency standards would send an i...
                              ...                        
 236    see answers supplied by email and in the attac...
 237    please see my responseplease see my responseci...
 238    it is important to focus on the main polluters...
 239    please refer to the attached document for our ...
 240    1. provide free registration and accident insu...
 Name: cleaned_combined_text, Length: 241, dtype: object)

In [72]:
count = list()

In [73]:
for i in range(len(Loaded_semi_cleaned_data.iloc[:,-1])):
    count.append(len(Loaded_semi_cleaned_data.iloc[i,-1]))

In [238]:
rows_to_do_nlp_later = []
for i in range(len(count)):
    if count[i]>10000:
        rows_to_do_nlp_later.append(i)

In [239]:
rows_to_do_nlp_later

[91, 123, 141, 142, 184, 190, 218, 225, 230]

In [243]:
len(count),count[123]

(241, 61968)

In [74]:
import numpy as np

In [75]:
np.array(count).max()

61968

In [245]:
text_len_df = 0
label_len_df = 0

In [246]:
for i in range(len(Loaded_semi_cleaned_data.iloc[:,-1])):
    if i in rows_to_do_nlp_later:
        continue
    else:
        One_string_C_post_contents = Loaded_semi_cleaned_data.iloc[i,-1]
        print(len(One_string_C_post_contents))
        if len(One_string_C_post_contents) > 11000:
            print("length > 10000")
            break
        doc = nlp(One_string_C_post_contents)
        text = []
        labels = []
        for ent in doc.ents:
            text.append(ent.text)
            labels.append(ent.label_)
        with open('text_NER.txt', 'a') as f:
            for item in text:
                f.write(item + '\n')
        with open('labels_NER.txt', 'a') as f:
            for item in labels:
                f.write(item + '\n')
        text_len_df += len(text)
        label_len_df += len(labels)
        if text_len_df != label_len_df:
            print("lens not matched!!!!!!!!")
        print("index",i)
        print('text_len_df:  ',text_len_df)
        print('index:  ',label_len_df)

5546
index 0
text_len_df:   33
index:   33
2350
index 1
text_len_df:   56
index:   56
8059
index 2
text_len_df:   91
index:   91
950
index 3
text_len_df:   95
index:   95
4845
index 4
text_len_df:   105
index:   105
1247
index 5
text_len_df:   107
index:   107
2938
index 6
text_len_df:   125
index:   125
5580
index 7
text_len_df:   152
index:   152
2226
index 8
text_len_df:   158
index:   158
3587
index 9
text_len_df:   178
index:   178
3137
index 10
text_len_df:   194
index:   194
3695
index 11
text_len_df:   207
index:   207
1433
index 12
text_len_df:   217
index:   217
1448
index 13
text_len_df:   222
index:   222
1025
index 14
text_len_df:   227
index:   227
1804
index 15
text_len_df:   243
index:   243
3110
index 16
text_len_df:   253
index:   253
1829
index 17
text_len_df:   257
index:   257
2479
index 18
text_len_df:   275
index:   275
1599
index 19
text_len_df:   277
index:   277
2809
index 20
text_len_df:   292
index:   292
7181
index 21
text_len_df:   311
index:   311
4038
in

In [247]:
rows_to_do_nlp_later

[91, 123, 141, 142, 184, 190, 218, 225, 230]

In [361]:
One_string_C_post_contents = Loaded_semi_cleaned_data.iloc[230,-1][:]

In [362]:
#No mor than 38000
len(One_string_C_post_contents)

21294

# Carefull !!!!!!!!!!!!!!!!!!

In [363]:
doc = nlp(One_string_C_post_contents)

# Max alittle

In [364]:
text = []
labels = []

In [365]:
for ent in doc.ents:
    text.append(ent.text)
    labels.append(ent.label_)

In [366]:
with open('text_NER.txt', 'a') as f:
    for item in text:
        f.write(item + '\n')

In [367]:
with open('labels_NER.txt', 'a') as f:
    for item in labels:
        f.write(item + '\n')

# END of repeating

In [368]:
with open('text_NER.txt', 'r') as f:
    my_list = f.readlines()
text = [item.replace("\n","") for item in my_list]

In [369]:
len(text)

4857

In [370]:
type(text)

list

In [371]:
with open('labels_NER.txt', 'r') as f:
    my_list = f.readlines()
labels = [item.replace("\n","") for item in my_list]

In [372]:
len(labels)

4857

In [373]:
type(labels)

list

# max alittle

In [374]:
df = pd.DataFrame({"text":text,"labels":labels})

In [375]:
df

Unnamed: 0,text,labels
0,climate council,ORG
1,30 km,QUANTITY
2,australians,NORP
3,1.5 degrees c,QUANTITY
4,sydney,GPE
...,...,...
4852,tesla owners club,ORG
4853,march 2016,DATE
4854,nearly 2000,CARDINAL
4855,100%,PERCENT


In [376]:
df.to_csv("NER_CheckPoint_MK_P_01.csv",index=False)

# END of NER

# EDA With NER dataset

## filter out the company of person because it is a bias and it apears every where :(

In [None]:
Ner_df = pd.read_csv("Hamed_NER_final.csv")
Ner_df

Unnamed: 0,text,labels
0,Tammy Duckworth,PERSON
1,months,DATE
2,one,CARDINAL
3,the United States Senate,ORG
4,Tuberville,PERSON
...,...,...
4313,Tammy Duckworth,PERSON
4314,Tammy Duckworth,PERSON
4315,the Hispanic Housing Development Corporation,ORG
4316,Hipolito (Paul) Roldn,PERSON


# Done

# END Of Mohammad NER :|

# Step 2 of cleaning