In [35]:
# Data Setup

# Choose which columns you want dropped from the model. This is a list!

# Columns I chose to delete:
#    comments            too many null values in the column
#    photo_url           not applicable to the experiment it's just a url for the picture
#    collected_date      Dates are dependent on unpredictable events happening in the world. We will get rid of this for now
#    created_at          same as above
#    updates_author      This is a name. Not useful for prediction
dropcols = ['comments', 'photo_url', 'collected_date', 'created_at', 'updates_author', 'Column1']

# Choose your data set and get the data imported
#df = import_data('updates', dropcols)

# Run your unistats summary
uniStats(df)

# Because none of these files are numeric types (not counting id's), we will not adjust for skewness with any of these files

Unnamed: 0,DataType,Count,Null Count,Unique Values,Mode,Mean,STD,Min,25%,Median,75%,Max,Skew,Kurt
updates_text,object,37368,0,1712,Custom design more to come.,-,-,-,-,-,-,-,-,-
updates_author_type,object,37368,0,3,ORGANIZER,-,-,-,-,-,-,-,-,-
update_id,int64,37368,0,1722,24145104,2.41396e+07,28852.2,23456838,2.41369e+07,2.41444e+07,2.41525e+07,24160136,-7.67312,98.6246
campaign_id,int64,37368,0,736,46073696,4.62315e+07,296268,17242942,4.61067e+07,4.62051e+07,4.63514e+07,47012554,-33.7156,2955.22


In [36]:
# Let's change our data types
df['update_id'] = df['update_id'].astype(int)
df['updates_text'] = df['campaign_id'].astype(str)
df['campaign_id'] = df['campaign_id'].astype(int)
df['updates_author_type'] = df['updates_author_type'].astype(str)

In [37]:
# Now, we need to take out the line breaks
for row in df['updates_text']:
    df['updates_text'].replace('\n', '')

In [31]:
# Function to import your data. This will be called throughout the worksheet
def import_data(data,drop):
    # Import packages
    import pandas as pd
    
    data = data
    
    df = pd.read_csv('C:/Analytics/Data/' + data + '.csv')
    
    # Clean updates_author_type row. We will fill this with "Organizer" as that is the mode value
    df.updates_author_type.fillna('ORGANIZER', inplace=True)
    
    # Remove rows that don't involve 2020
    # Remove rows that don't involve 2020
    year = df.created_at.str[:4]
    df['ld_year'] = year 
    df.drop(df[df['ld_year'] != '2020'].index, inplace=True)
    df.drop(columns=['ld_year'], inplace=True)
            
    # Remove rows where the author's name has corona
    # NOTE: I am having a hard time iterrating over the text to check if that is also there. However, there are only 108 in this file,
    #       So I will just delete the 108 rows. That's a pretty insignificant number of rows
    for row in df.iterrows():
        if ('corona' in row[1][6] or 'Corona' in row[1][6]):
            #if not 'coronaI' in row[1][8] or 'Corona' in row[1][8]:
            #    corona_count +=1
            df.drop([row[0]], inplace=True)
    
    for i in drop:
        df.drop(columns=[i], inplace=True)
        
    # This will drop the 198 null rows in the updates_text file
    df.dropna(axis = 0, inplace=True)
        
    return df

In [5]:
def uniStats(df):
    # Import packages
    import pandas as pd
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_columns', 100)

    # Build the data frame
    new_df = pd.DataFrame(columns = ['DataType', 'Count', 'Null Count', 'Unique Values', 'Mode', 
                                     'Mean', 'STD', 'Min', '25%', 'Median', '75%', 'Max', 'Skew', 'Kurt'])

    # Build the data frame:
    for col in df:
        # Build data frame for numeric stats:
        if pd.api.types.is_numeric_dtype(df[col]):
            new_df.loc[col] = [df[col].dtype, df[col].count(), df[col].isnull().sum(), df[col].nunique(), 
                               df[col].mode().values[0], df[col].mean(), df[col].std(), df[col].min(), 
                               df[col].quantile(.25), df[col].median(), df[col].quantile(.75), df[col].max(), 
                               df[col].skew(), df[col].kurt()]
        # Build data frame for categorical stats:
        else:
            new_df.loc[col] = [df[col].dtype, df[col].count(), df[col].isnull().sum(), df[col].nunique(), 
                               df[col].mode().values[0], '-', '-', '-', '-', '-', '-', '-', '-', '-']

    return new_df.sort_values(by=['DataType', 'Skew', 'Unique Values'], ascending=False)

In [38]:
# Export to a CSV
df.to_csv(r'C:\Analytics\data\updates_clean.csv', index = False)

In [27]:
df.head(100)

Unnamed: 0,update_id,campaign_id,updates_author_type,updates_text
202,23456838,43814064,ORGANIZER,Since I last updated the wooden fence between ...
204,23496568,44507882,ORGANIZER,#TodossomosMiguelito
205,23501090,44507882,ORGANIZER,Numero de Cuenta en Pesos
206,23506548,42707740,ORGANIZER,We still need to raise $7000 dollars for healt...
209,23545636,44507882,ORGANIZER,"We thank you All for your prayers, donations a..."
210,23639824,44110880,ORGANIZER,Our Young Mathematicians Busy at Work!!
211,23641718,44967662,ORGANIZER,Please we need your help
215,23657124,44998668,TEAM_MEMBER,These are our orders and confirmations.
217,23661206,44996060,ORGANIZER,"Dear friends,\n\nWe have located few places (o..."
218,23669512,44996060,ORGANIZER,Today we have purchased 1304 pcs of protective...
