## SSC Functions
To reduce redundancy, shared functions are created and documented here, which keeps the rest of the code more modular.

Here you will find functions for data cleaning and modeling.

In [None]:
def clean_string(string):
    """ cleans string of punctuation and unwanted characters

    Steps, in order: apostrophes are dropped, the text is lowercased,
    @mentions, #hashtags, URLs (http... and www....) and [bracketed] spans
    are removed, every remaining non-letter character becomes a space, and
    tokens found in the module-level ``stopwords`` collection are discarded.

    args:
        string(str): string to clean; other non-missing values are
            converted with str() first

    returns:
        temp(str): cleaned, space-separated string; "" when the input is a
            float (e.g. NaN for a missing value) or None
    """
    # missing values (NaN floats, None) carry no text to clean;
    # isinstance also catches float subclasses such as numpy.float64
    if string is None or isinstance(string, float):
        return ""

    # drop apostrophes first so english contractions stay a single token
    temp = re.sub("'", "", str(string))
    temp = temp.lower()
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)   # @mentions
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)   # #hashtags
    temp = re.sub(r"http\S+", "", temp)          # http/https links
    # the dot is escaped so only real "www." prefixes are treated as links
    temp = re.sub(r"www\.\S+", "", temp)
    temp = re.sub(r"[()!?]", " ", temp)
    temp = re.sub(r"\[.*?\]", " ", temp)         # [bracketed] spans
    # NOTE: digits are stripped as well, so a token like "co2" becomes "co"
    temp = re.sub(r"[^a-z]", " ", temp)

    tokens = [w for w in temp.split() if w not in stopwords]
    return " ".join(tokens)

In [None]:
def res(string_list):
    """ if a list is in string format ('['hello']'), sets it to its actual type

    The text is parsed with ast.literal_eval. A value that is already a
    list, tuple, set or dict is returned unchanged, so applying this
    function twice to the same data (e.g. re-running a cell) does not fail.

    args:
        string_list(str): list as a string

    returns:
        the Python object described by the string (a list for "['hello']"),
        or the input itself when it is already a container

    raises:
        ValueError, SyntaxError: propagated from ast.literal_eval when the
            input is not a valid Python literal
    """
    # already converted: literal_eval would raise ValueError on a non-string
    if isinstance(string_list, (list, tuple, set, dict)):
        return string_list
    return ast.literal_eval(string_list)

In [None]:
def remove_bad_string(series, bad_string):
    """
    drops every occurrence of a string from each list held in a series

    args:
        series(series): a series whose elements are lists
        bad_string(str): string to be removed

    returns:
        series(series): result of series.apply, holding one filtered list
            per element of the input
    """
    def without_bad_string(items):
        """ builds a copy of items that leaves out entries equal to bad_string

        args:
            items(list): a list
        """
        kept = []
        for item in items:
            if item != bad_string:
                kept.append(item)
        return kept

    # filter each element of the series independently
    return series.apply(without_bad_string)

In [None]:
def group_by(column_name, dataframe):
    """
    breaks a dataframe apart into one dataframe per group of a given column

    args:
        column_name(str) : the column name string passed to dataframe.groupby
        dataframe(df): the dataframe to be split up

    returns:
        dataframes(list): a list of dataframes, in the order groupby yields them

    """
    pieces = []
    # groupby yields (key, sub-dataframe) pairs; only the sub-dataframe is kept
    for _key, piece in dataframe.groupby(column_name):
        pieces.append(piece)

    return pieces

In [None]:
def run_LDA(words, num_topics, random_state = 100, update_every = 1,
            chunksize = 10, passes = 10, alpha = 'auto',
            per_word_topics = True):
    """
    Fit an LDA topic model on `words`, print its topics and return the
    data prepared for visualization.

    args:
        words (pd.Series): The documents to model. Each element is passed to
            dictionary.doc2bow, so presumably each one is a list of tokens
            (TODO confirm against callers).
        num_topics (int): The number of topics; forwarded to LdaModel.
        random_state (int, optional): Forwarded to LdaModel. Defaults to 100.
        update_every (int, optional): Forwarded to LdaModel. Defaults to 1.
        chunksize (int, optional): Forwarded to LdaModel. Defaults to 10.
        passes (int, optional): Forwarded to LdaModel. Defaults to 10.
        alpha (str or float, optional): Forwarded to LdaModel. Defaults to 'auto'.
        per_word_topics (bool, optional): Forwarded to LdaModel. Defaults to True.

    returns:
        vis_data: The result of gensimvis.prepare for the fitted model,
            intended for visualization with pyLDAvis.

    """

    # vocabulary first, then one bag-of-words vector per entry of `words`
    id2word = corpora.Dictionary(words)
    bow_corpus = []
    for document in words:
        bow_corpus.append(id2word.doc2bow(document))

    # gather every model option in one place before building the model
    model_options = {
        "corpus": bow_corpus,
        "id2word": id2word,
        "num_topics": num_topics,
        "random_state": random_state,
        "update_every": update_every,
        "chunksize": chunksize,
        "passes": passes,
        "alpha": alpha,
        "per_word_topics": per_word_topics,
    }
    topic_model = gensim.models.LdaModel(**model_options)

    # prepare the visualization data before reporting the topics
    prepared = gensimvis.prepare(topic_model, bow_corpus, id2word)

    # report the top words of each topic on stdout
    for topic_summary in topic_model.print_topics():
        print(topic_summary)

    return prepared