In [34]:
import matplotlib.pyplot as plt
import wordcloud as wc
import re
import pandas as pd
import numpy as np
import random

def csv_to_df(filename):
    ''' Function: csv_to_df
        Parameters: filename (string), path of the CSV file to load
        Returns: dataframe holding the contents of the file, parsed
                by pandas with its default settings
    '''
    return pd.read_csv(filename)

def okcupid_regex(df):
    ''' Function: okcupid_regex
        Parameters: df (dataframe)
        Returns: the same dataframe (modified in place) with HTML
                tags stripped and newlines replaced by spaces in
                every object/string column
    '''
    for col in df:
        dtype = df.dtypes[col]
        # np.object was removed in NumPy 1.24, so the check relies on the
        # pandas dtype helpers, which cover both object columns and the
        # dedicated pandas string dtypes
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            without_tags = df[col].str.replace(r'<[^>]+>', '', regex=True)
            # a newline is a literal character, no regex engine needed
            df[col] = without_tags.str.replace('\n', ' ', regex=False)

    return df

def read_txt(text):
    ''' Function: read_txt
        Parameters: text (string), raw essay text that may contain HTML
        Returns: the text (string) with HTML tags, square-bracketed
            spans and punctuation removed, and newlines replaced
            by spaces
    '''

    text = re.sub(r'<[^>]+>', '', text)
    # non-greedy, so two separate [..] spans on one line do not swallow
    # the words that sit between them
    text = re.sub(r'\[.+?\]', '', text)
    text = re.sub(r'<.+>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # drop leftover markup tokens only when they stand alone; a bare
    # substring match would mangle real words ("brother", "library")
    text = re.sub(r'\bhref\b', '', text)
    text = re.sub(r'\bbr\b', '', text)
    text = re.sub(r'\n', ' ', text)
    return text


def combine_essay0(df):
    ''' Function: combine_essay0
        Parameters: df (dataframe) with an 'essay0' column of strings
        Returns: list of each user's essay0, cleaned with read_txt,
                in row order
    '''
    # iterate over the column's values directly so the result does not
    # depend on the dataframe having a default 0..n-1 index
    return [read_txt(text) for text in df['essay0']]



def generate_lyric(begin, ending, ngrams):
    ''' Function: generate_lyric
        Parameters: list of begin-words, list of end-words,
                    dictionary of key = word, value = list of followed-by
        Returns: one line of a lyric (string); every word, including
                the first, is preceded by a single space
        Raises: IndexError if the list of begin-words is empty
    '''
    # built once so the membership test inside the loop is a set lookup
    stop_words = set(ending)

    curr_word = random.choice(begin)
    words = [curr_word]
    while curr_word not in stop_words:
        followers = ngrams.get(curr_word)
        if not followers:
            # dead end: nothing ever follows this word, so finish the
            # line here instead of failing with a KeyError
            break
        curr_word = random.choice(followers)
        words.append(curr_word)

    return ''.join(' ' + word for word in words)

def get_ngram(text, limit=100):
    ''' Function: get_ngram
        Parameters: text (list of strings),
                    limit (int or None), how many leading entries of
                    text to use; None uses all of them
        Returns: list of first words, list of last words, and a
                dictionary of key = word, value = list of the words
                that directly follow it (one item per occurrence)
    '''

    first_word_list = []
    last_word_list = []
    lyrics_dic = {}

    # slicing copes with inputs shorter than limit, and the entries are
    # only read, so the caller's list is left untouched
    for entry in text[:limit]:
        words = entry.split()
        if not words:
            # empty or whitespace-only entry: no first/last word to record
            continue
        first_word_list.append(words[0])
        last_word_list.append(words[-1])

        # pair every word with the word that comes right after it
        for word, follower in zip(words, words[1:]):
            lyrics_dic.setdefault(word, []).append(follower)

    return first_word_list, last_word_list, lyrics_dic

In [43]:
if __name__ == "__main__":
    okcupid = 'profiles.csv.zip'
    # load the profiles, keep only rows without missing values and
    # renumber the remaining rows from 0
    df = csv_to_df(okcupid).dropna().reset_index(drop=True)

    essay = combine_essay0(df)
    first_word_list, last_word_list, lyrics_dic = get_ngram(essay)

    # five generated lines, each one closed with a period
    lines = []
    for i in range(5):
        lines.append(generate_lyric(first_word_list, last_word_list, lyrics_dic) + '.')
    sentence = ''.join(lines)
    print(sentence)
    

 i write runon sentences for school. i do. im being a regular basis coping with me. learning new. living it.
