In [41]:
#import libraries
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd   
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

<b>Data Merging</b>

In [58]:
#load data
df_ableton = pd.read_csv('ableton_data_20nov2022.csv', low_memory=False)
df_flstudio = pd.read_csv('flstudio_data_20nov2022.csv', low_memory=False)

#Combined Data and assign 1 = ableton and 0 = flstudio to differentiate the source
df_ableton["label"] = 1
df_ableton = df_ableton[['label','title','selftext']]
df_flstudio["label"] = 0
df_flstudio = df_flstudio[['label','title','selftext']]
df_main = pd.concat([df_ableton, df_flstudio])

<b>Defining Functions for Cleaning</b>

In [42]:
#define function to clean text

df_cleaning_text = pd.read_csv('cleaning_text.csv', low_memory=False)

def cleaning_text(text):
    
    try:   
        for i in range(df_cleaning_text.shape[0]):
            text = text.replace(str(df_cleaning_text.word[i]),str(df_cleaning_text.replacement[i]))
        
            text = re.sub(r'http\S+', '', text) #remove URL
    
        if text == '[removed]': #replace [removed] as blank
            text = ""
        
        return text

    except:
        return text

In [43]:
#define function to clean spelling
#the cleaning_word file is blank for the submission 
#refer to cleaning_word_archive for  the cumulative list

df_cleaning_word = pd.read_csv('cleaning_word.csv', low_memory=False)

def cleaning_word(text):
    
    try:   
        for i in range(df_cleaning_word.shape[0]):
             
            text = text.replace(" " + str(df_cleaning_word.word[i]) + " ",
                                " " + str(df_cleaning_word.replacement[i]) + " ")
          
        return text
    
    except:
        return text

In [44]:
#define function to combine terms

df_cleaning_terms = pd.read_csv('cleaning_terms.csv', low_memory=False)

def cleaning_terms(text):
    
    try:   
        for i in range(df_cleaning_terms.shape[0]):
              
            text = text.replace(" " + str(df_cleaning_terms.word[i]) + " ",
                                " " + str(df_cleaning_terms.replacement[i]) + " ")
                
        return text
    
    except:
        return text

In [45]:
#define function for lemmazization

def lemmatize_text(text):  
    
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    
    try:
        text = ' '.join([lemmatizer.lemmatize(w) for w in tokenizer.tokenize(text)])
    
        return text
    
    except:
        return text

In [71]:
#checking on stop words
stop = set(stopwords.words('english'))
print(stop)

{'with', 'should', 'yours', "you'll", 'doesn', 'further', 'did', 'its', "doesn't", 'by', "wouldn't", 'about', 'both', 'there', 'aren', 'haven', 'again', "mightn't", "mustn't", 'why', 'me', 'then', 'any', 'isn', 'our', 'she', 'same', 'just', 'a', 'weren', 'what', 'themselves', "hadn't", 'is', 'having', 'such', 'from', 'do', 'can', "didn't", 'until', "shouldn't", 'whom', 'because', 'between', "haven't", 'while', 'so', 'are', 'up', 'only', 'they', 'an', 'my', 'was', 'to', 'mustn', 'i', 'ourselves', 'down', 'theirs', 'where', 'himself', 'being', 'don', 'shouldn', "it's", "weren't", "aren't", 'doing', 'but', 'into', 'itself', 'their', 'at', 'wouldn', 'if', 'his', 'of', 'your', 'that', 'the', 'here', 'll', 'you', 'as', 'some', 'than', 'shan', 'those', 'yourselves', 'under', 'other', "you'd", 'couldn', 'hasn', "should've", 'not', 't', 'it', 'who', 'during', "that'll", "hasn't", 'her', 'will', 'most', 'herself', 'when', 'for', 'yourself', 'does', 'now', 'wasn', 'nor', 'these', 'we', "wasn't", 

In [72]:
#include new stop words
new_stops = {"i'm", "we've", "hi", "doe", "doesn't", "0", "was", "wa"}
stop.update(new_stops)

In [49]:
#define function to remove stop words

def remove_stop(text):
    tokenizer = RegexpTokenizer(r'\w+')

    if len(str(text)) != 0:
        word_tokens = tokenizer.tokenize(str(text))
        
        for i in word_tokens:
            no_stop = ' '.join([w for w in word_tokens if not w.lower() in stop])
            
        try:   
            return no_stop
        
        except:
            return text
    else:
        return None

In [66]:
#initate new column 'cleantext' and cleantitle
df_main['cleantext'] = df_main['selftext'].copy()
df_main['cleantitle'] = df_main['title'].copy()

In [67]:
df_main.to_csv(r'clean_data.csv', index=False)

<b>Apply Cleaning</b><br>
This is an iterative process as progressing cleaning is done based on the output from modelling.

In [50]:
#recall cleaning data
df_main = pd.read_csv('clean_data.csv', low_memory=False)

In [80]:
df_main.groupby(['label']).describe()

Unnamed: 0_level_0,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,9305.0,46.327673,39.176953,20.0,26.0,36.0,53.0,981.0
1,14881.0,54.636718,52.849603,20.0,29.0,41.0,63.0,2001.0


In [52]:
#initate cleaning text
df_main['cleantext'] = df_main['cleantext'].apply(cleaning_text)
df_main['cleantitle'] = df_main['cleantitle'].apply(cleaning_text)

In [53]:
#convert to string
df_main["cleantext"] = df_main["cleantext"].astype('string')
df_main["cleantitle"] = df_main["cleantitle"].astype('string')

In [79]:
#remove duplicates
df_main = df_main.drop_duplicates(subset='selftext', keep="first")
df_main = df_main.drop_duplicates(subset='title', keep="first")

In [55]:
#set text to lowercase
df_main['cleantext'] = df_main['cleantext'].str.lower()
df_main['cleantitle'] = df_main['cleantitle'].str.lower()

In [75]:
#drop text with less than 20 words
df_main['wordcount'] = [len(re.findall(r'\w+', str(i))) for i in df_main['cleantext']]
df_main = df_main.drop(df_main[df_main.wordcount < 20].index)

In [57]:
#customised processing of cleaning words e.g. spelling
df_main['cleantext'] = df_main['cleantext'].apply(cleaning_word)
df_main['cleantitle'] = df_main['cleantitle'].apply(cleaning_word)

In [58]:
#customised processing of combining terms
df_main['cleantext'] = df_main['cleantext'].apply(cleaning_terms)
df_main['cleantitle'] = df_main['cleantitle'].apply(cleaning_terms)

In [59]:
#lemmatization
df_main['cleantext'] = df_main['cleantext'].apply(lemmatize_text)
df_main['cleantitle'] = df_main['cleantitle'].apply(lemmatize_text)

In [73]:
#remove stop words
df_main["cleantext"] = df_main.cleantext.apply(remove_stop)
df_main["cleantitle"] = df_main.cleantitle.apply(remove_stop)

In [81]:
#summary of clean data
df_main.groupby(['label']).describe()

Unnamed: 0_level_0,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,9305.0,46.327673,39.176953,20.0,26.0,36.0,53.0,981.0
1,14881.0,54.636718,52.849603,20.0,29.0,41.0,63.0,2001.0


In [82]:
#to ensure all data are string before merging
df_main["cleantext"] = df_main["cleantext"].astype('string')
df_main["cleantitle"] = df_main["cleantitle"].astype('string')

In [83]:
df_main["combinedtext"] = df_main["cleantext"].map(str) + " " + df_main["cleantitle"].map(str)

In [91]:
#export data
df_main.to_csv(r'clean_data.csv', index=False)

<b>Generate Word Count List</b>

In [85]:
text_list = df_main.combinedtext.values.tolist()
word_list = []
for item in text_list:
    breakdown_words = word_tokenize(item)
    for word in breakdown_words:
        word_list.append(word)

In [86]:
#generate list of unique words
wordset = set(word_list)
unique_list = list(wordset)
len(unique_list)

29245

In [87]:
#generate count for each unique word
unique_count = []
process_count = 0
print("Processing in Progress......")
for x in unique_list:
    count = 0
    process_count += 1
    for y in word_list:
        if x == y:
            count += 1
    unique_count.append(count)
    if(process_count % 5000 == 0):
        print(str(process_count) + " words completed")
print("Process Completed......")

Processing in Progress......
5000 words completed
10000 words completed
15000 words completed
20000 words completed
25000 words completed
Process Completed......


In [88]:
df_word = pd.DataFrame(list(zip(unique_list, unique_count)), columns =['word', 'count'])

In [89]:
#export word list for manual data cleaning referencce
df_word.to_csv(r'word_list.csv', index=False)

<b>Top Words</b>

In [90]:
#top words
df_word = df_word.sort_values(by=['count'], ascending=False)
df_word.head(20)

Unnamed: 0,word,count
4710,ableton,16653
10227,track,12437
23678,like,11357
25178,sound,10924
2476,midi,10143
4526,audio,10082
26068,use,8681
24672,would,8650
27644,know,8562
29057,get,8081
