In [27]:
#import libraries
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd   
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

<b>Data Merging</b>

In [28]:
#load data
df_ableton = pd.read_csv('ableton_data_18nov2022.csv', low_memory=False)
df_flstudio = pd.read_csv('flstudio_data_18nov2022.csv', low_memory=False)

#Combined Data and assign 1 = ableton and 0 = flstudio to differentiate the source
df_ableton["label"] = 1
df_ableton = df_ableton[['label','title','selftext']]
df_flstudio["label"] = 0
df_flstudio = df_flstudio[['label','title','selftext']]
df_main = pd.concat([df_ableton, df_flstudio])

In [29]:
#Obtained a raw data of 8,958 from ableton and 8,738 from flstudio
df_main.groupby(['label']).describe()

Unnamed: 0_level_0,title,title,title,title,selftext,selftext,selftext,selftext
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,8738,8633,Help,9,8738,8738,Curious to see how long users have had with th...,1
1,8958,8912,What did you make in Live this week? / Feedbac...,5,8958,8958,"Hi everyone, the first thing is that I'm Spani...",1


<b>Defining Functions for Cleaning</b>

In [77]:
#define function to clean text

df_cleaning_text = pd.read_csv('cleaning_text.csv', low_memory=False)

def cleaning_text(text):
    for i in range(df_cleaning_text.shape[0]):
        text = text.replace(str(df_cleaning_text.word[i]),str(df_cleaning_text.replacement[i]))
        
        text = re.sub(r'http\S+', '', text) #remove URL
    
    if text == '[removed]': #replace [removed] as blank
        text = np.nan
        
    if text == 'nan':
        text = np.nan
        
    return text

In [78]:
#define function to clean spelling

df_cleaning_word = pd.read_csv('cleaning_word.csv', low_memory=False)

def cleaning_word(text):
    for i in range(df_cleaning_word.shape[0]):
              
        text = text.replace(" " + str(df_cleaning_word.word[i]) + " ",
                            " " + str(df_cleaning_word.replacement[i]) + " ")
          
    if text == '[removed]': #replace [removed] as blank
        text = np.nan
        
    if text == 'nan':
        text = np.nan
        
    return text

In [79]:
#define function to combine terms

df_cleaning_terms = pd.read_csv('cleaning_terms.csv', low_memory=False)

def cleaning_terms(text):
    for i in range(df_cleaning_terms.shape[0]):
              
        text = text.replace(" " + str(df_cleaning_terms.word[i]) + " ",
                            " " + str(df_cleaning_terms.replacement[i]) + " ")
          
    if text == '[removed]': #replace [removed] as blank
        text = np.nan
        
    if text == 'nan':
        text = np.nan
        
    return text

In [80]:
#define function for lemmazization

def lemmatize_text(text):  
    
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    
    try:
        text = ' '.join([lemmatizer.lemmatize(w) for w in tokenizer.tokenize(text)])
    
    except:
        print('error')
        
    return text

In [81]:
#checking on stop words
stop = set(stopwords.words('english'))
print(stop)

{'being', 'don', 'myself', "that'll", 'a', 'we', 'how', 'do', "don't", "won't", 'above', 'she', "you'll", "wouldn't", 'our', 'these', 'they', 'weren', 'down', 'needn', 'with', 'hers', 'been', 'were', 'such', "weren't", 'while', 'all', 'aren', 'an', 'through', 'when', 'those', 'then', 'own', "doesn't", 'this', "should've", 'me', 'further', 'did', 'just', 'under', 'at', 'his', "hadn't", "mightn't", 'of', 'shouldn', "you've", 'am', 'itself', 'hasn', "needn't", "aren't", "shan't", 'won', 'its', 'herself', 'didn', 'because', 'are', 'doesn', 'during', 'y', 'should', 'be', 'yourself', 'on', "couldn't", "isn't", 'haven', 'between', 'what', 'from', 'couldn', 'why', 'isn', 'or', 'most', 'until', "you'd", 'too', 'once', 'you', 'as', 'about', 'o', 'the', 'before', 'again', 'against', 'their', 'no', "haven't", 'had', 'now', 't', 've', 'very', 'yourselves', "hasn't", 'd', 'not', 'over', 'was', 'for', 're', 'himself', 'few', 'mustn', 'themselves', 'wouldn', 'out', 'wasn', 'them', 'both', 'some', 'ma'

In [82]:
#include new stop words
new_stops = {"i'm", "we've", "hi", "doe", "doesn't"}
stop.update(new_stops)

In [83]:
#define function to remove stop words

def remove_stop(text):
    tokenizer = RegexpTokenizer(r'\w+')
    word_tokens = tokenizer.tokenize(text)
    for i in word_tokens:
            no_stop = ' '.join([w for w in word_tokens if not w.lower() in stop])          
    try:       
        return no_stop
    except:
        return np.nan #there are title with a blank characters (" ")

<b>Apply Cleaning</b>

In [84]:
#iterative process for continuous cleaning data (so as not to rerun past edits)
df_main = pd.read_csv('clean_data.csv', low_memory=False)

In [85]:
#initate new column 'cleantext' and cleantitle
df_main['cleantext'] = df_main['selftext'].apply(cleaning_text)
df_main['cleantitle'] = df_main['title'].apply(cleaning_text)

In [86]:
#convert to string
df_main["selftext"] = df_main["selftext"].astype('string')
df_main["title"] = df_main["title"].astype('string')

In [87]:
#remove duplicates
df_main = df_main.drop_duplicates(subset='selftext', keep="first")
df_main = df_main.drop_duplicates(subset='title', keep="first")

In [88]:
#set text to lowercase
df_main['cleantext'] = df_main['cleantext'].str.lower()
df_main['cleantitle'] = df_main['cleantitle'].str.lower()

In [89]:
#drop text with less than 20 words
df_main['wordcount'] = [len(re.findall(r'\w+', str(i))) for i in df_main['cleantext']]
df_main = df_main.drop(df_main[df_main.wordcount < 20].index)

In [90]:
#customised processing of cleaning words e.g. spelling
df_main['cleantext'] = df_main['cleantext'].apply(cleaning_word)
df_main['cleantitle'] = df_main['cleantitle'].apply(cleaning_word)

In [91]:
#customised processing of combining terms
df_main['cleantext'] = df_main['cleantext'].apply(cleaning_terms)
df_main['cleantitle'] = df_main['cleantitle'].apply(cleaning_terms)

In [92]:
#lemmatization
df_main['cleantext'] = df_main['cleantext'].apply(lemmatize_text)
df_main['cleantitle'] = df_main['cleantitle'].apply(lemmatize_text)

In [93]:
#remove stop words
df_main["cleantext"] = df_main.cleantext.apply(remove_stop)
df_main["cleantitle"] = df_main.cleantitle.apply(remove_stop)

In [94]:
#summary of clean data
df_main.groupby(['label']).describe()

Unnamed: 0_level_0,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount,wordcount
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,8272.0,78.505319,87.612766,20.0,38.0,59.0,92.0,3749.0
1,8635.0,94.516503,93.229845,20.0,46.0,73.0,114.0,3868.0


In [95]:
df_main["cleantext"] = df_main["cleantext"].astype('string')
df_main["cleantitle"] = df_main["cleantitle"].astype('string')

In [96]:
df_main["combinedtext"] = df_main["cleantext"].map(str) + " " + df_main["cleantitle"].map(str)

<b>Generate Word Count List</b>

In [97]:
text_list = df_main.combinedtext.values.tolist()
word_list = []
for item in text_list:
    breakdown_words = word_tokenize(item)
    for word in breakdown_words:
        word_list.append(word)

In [98]:
#generate list of unique words
wordset = set(word_list)
unique_list = list(wordset)
len(unique_list)

22759

In [99]:
#generate count for each unique word
unique_count = []
process_count = 0
for x in unique_list:
    count = 0
    process_count += 1
    for y in word_list:
        if x == y:
            count += 1
    unique_count.append(count)
    if(process_count % 5000 == 0):
        print(str(process_count) + " words completed")

5000 words completed
10000 words completed
15000 words completed
20000 words completed


In [100]:
df_word = pd.DataFrame(list(zip(unique_list, unique_count)), columns =['word', 'count'])

In [101]:
#export word list for manual data cleaning referencce
df_word.to_csv(r'word_list.csv', index=False)

<b>Export Data</b>

In [103]:
#export data
df_main.to_csv(r'clean_data.csv', index=False)