In [129]:

# IMPORT REQUIRED PACKAGES

import pandas as pd
import numpy as np
from IPython.display import display, HTML

#------------------INPUT-------------------#

# INSERT FILENAME
file_name = 'Tweets_Chatgpt_2023.csv'

#------------------------------------------#

def count_special_characters(df):
    special_characters = r'!?-\|:;#@()+-="{}[]*$%^&€£/~`'
    special_char_count = {}
    columns = df.columns

    for column in columns:
        special_char_count[column] = 0
        for value in df[column]:
            if any(char in special_characters for char in str(value)):
                special_char_count[column] += 1

    return [count for _, count in special_char_count.items()]


def count_rows_with_whitespace(df):
    counts = {}
    for column in df.columns:
        count = df[column].apply(lambda value: isinstance(value, str) and r'\s{2,}' in value).sum()
        counts[column] = count
    return counts.values()


def count_rows_with_breaks(df):
    counts = {}
    for column in df.columns:
        count = df[column].apply(lambda value: isinstance(value, str) and '\n' in value).sum()
        counts[column] = count
    return counts.values()


def format_numbers(value):
    if isinstance(value, (float, int)):
        return "{:,}".format(value)
    else:
        return value


# DATASET ANALYZER FUNCTION

def analyze_dataset(file_name):
    # Determine file format based on file extension
    file_extension = file_name.split('.')[-1].lower()

    # Read the data file
    if file_extension == 'csv':
        df = pd.read_csv(file_name)
    elif file_extension == 'xlsx':
        df = pd.read_excel(file_name)
    elif file_extension == 'json':
        df = pd.read_json(file_name)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}. Only CSV, XLSX, and JSON files are supported.")

    # Get dataset information
    num_columns = len(df.columns)
    num_rows = len(df)
    memory_usage = df.memory_usage().sum()

    # Create dataset table
    dataset_info = pd.DataFrame({
        'Total Columns': [num_columns],
        'Total Rows': [num_rows],
        'Memory Usage': [memory_usage]
    })

    # Get column information
    columns = df.columns.tolist() # Column Names
    data_types = df.dtypes.tolist() # Datatypes
    null_counts = df.isnull().sum().tolist() # NULL count
    na_counts = df.isna().sum().tolist() # NA count

    numeric_columns = df.select_dtypes(include=np.number).columns
    mean_values = df[numeric_columns].mean().round(1).tolist() # Average
    max_values = df[numeric_columns].max().round(1).tolist() # Maximum
    min_values = df[numeric_columns].min().round(1).tolist() # Minimum

    unique_counts = df.nunique().tolist() # Unique values count
    duplicate_counts = (df.duplicated(keep=False)).sum() # Duplicate values count

    # Create DataFrame with column information
    column_info = pd.DataFrame({
        'Column Name': columns,
        'Data Type': data_types,
        'Null Count': null_counts,
        'NA Count': na_counts,
        'Unique Count': unique_counts,
        'Duplicate Count': duplicate_counts,
        'Mean': np.nan,
        'Max': np.nan,
        'Min': np.nan
    })

    # Append mean, max, and min for Numeric DataFrame columns
    column_info.loc[column_info['Column Name'].isin(numeric_columns), 'Mean'] = mean_values
    column_info.loc[column_info['Column Name'].isin(numeric_columns), 'Max'] = max_values
    column_info.loc[column_info['Column Name'].isin(numeric_columns), 'Min'] = min_values

    # Count rows with whitespace
    whitespace_count = count_rows_with_whitespace(df)

    # Count rows with special characters
    special_characters_count = count_special_characters(df)

    # Count rows with break lines
    break_line_count = count_rows_with_breaks(df)

    # Append additional columns to the DataFrame
    column_info['Special Characters Count'] = special_characters_count
    column_info['Whitespace Count'] = whitespace_count
    column_info['Break Line Count'] = break_line_count

    # Dataset views
    head_table = df.head()
    tail_table = df.tail()
    sample_table = df.sample(10)

    # Display Report of All Outputs
    print(f"\n DATASET: {file_name}\n")
    
    print('\n DATASET OVERVIEW:')
    display(HTML(dataset_info.to_html(index=False)))

    print('\n DATASET SUMMARY:')
    formatted_info = column_info.applymap(format_numbers)
    display(HTML(formatted_info.to_html(index=True)))
    
    print("\nDATASET HEAD:")
    display(HTML(head_table.to_html(index=False)))
    print("\n\nDATASET TAIL:")
    display(HTML(tail_table.to_html(index=False)))
    print("\n\nDATASET SAMPLE:")
    display(HTML(sample_table.to_html(index=False)))


# EXECUTE DATASET ANALYZER FUNCTION

analyze_dataset(file_name)





 DATASET: Tweets_Chatgpt_2023.csv


 DATASET OVERVIEW:


Total Columns,Total Rows,Memory Usage
6,500036,24001856



 DATASET SUMMARY:


Unnamed: 0,Column Name,Data Type,Null Count,NA Count,Unique Count,Duplicate Count,Mean,Max,Min,Special Characters Count,Whitespace Count,Break Line Count
0,date,object,0,0,475394,0,,,,500036,0,0
1,id,object,6,6,500007,0,,,,0,0,0
2,content,object,6,6,493744,0,,,,479965,0,240030
3,username,object,34,34,250006,0,,,,0,0,0
4,like_count,float64,62,62,1066,0,7.1,64094.0,0.0,0,0,0
5,retweet_count,float64,62,62,489,0,1.5,16080.0,0.0,0,0,0



DATASET HEAD:


date,id,content,username,like_count,retweet_count
2023-03-29 22:58:21+00:00,1641213230730051584,"Free AI marketing and automation tools, strategies, and collaboration launching new week https://t.co/Qwti8LfBpb #ChatGPT",RealProfitPros,0.0,0.0
2023-03-29 22:58:18+00:00,1641213218520481805,@MecoleHardman4 Chat GPT says it’s 15. 😂,AmyLouWho321,0.0,0.0
2023-03-29 22:57:53+00:00,1641213115684536323,"https://t.co/FjJSprt0te - Chat with any PDF!\nCheck out how this new AI quickly answers questions from your PDFs.\nPerfect for students, researchers, and other curious minds. \n#research #chatpdf #ChatGPT",yjleon1976,0.0,0.0
2023-03-29 22:57:52+00:00,1641213110915571715,"AI muses: ""In the court of life, we must all face the judge of destiny and the jury of our actions. ⚖️🔮 #OutOfContextAI #AILifeLessons #ChatGPT",ChatGPT_Thinks,0.0,0.0
2023-03-29 22:57:26+00:00,1641213003260633088,"Most people haven't heard of Chat GPT yet.\nFirst, elite factions will decide which way to go on AI safety. Next they will push their agenda(s) on the public with misleading and oversimplified media presentations. Finally, the brainless Red and Blue camps will screech their lines",nikocosmonaut,0.0,0.0




DATASET TAIL:


date,id,content,username,like_count,retweet_count
2023-01-04 07:18:08+00:00,1610536038094757888,@GoogleAI #LAMDA Versus @OpenAI #ChatGPT ?! Who cares? Lamda isn't available. ChatGPT is. What's my reason to be interested in comparing the two? Lamda is fiction! There's no good evidence it exists! Lamda is as fake as Commander Data! https://t.co/g0SlTAJIs1 via @Marktechpost,Pup_In_Cup,1.0,0.0
2023-01-04 07:17:50+00:00,1610535961670172674,#ChatGPT \n\nSo much #Censorship.\n\nNever trust a system you don't admin. https://t.co/nlcr0FUeDx,TryingToOffend,2.0,0.0
2023-01-04 07:17:20+00:00,1610535837363486720,all my twitter feed is about ChatGPT and @OpenAI lol 😆\n\n#AI #ChatGPT,mcp350,3.0,1.0
2023-01-04 07:17:08+00:00,1610535786017091584,I'm quite amazed by Chat GPT. A really promising person 😄and Q mark for Google,manumurali369,1.0,0.0
2023-01-04 07:16:56+00:00,1610535734758219778,I used chat gpt to get gym workout program and it was so good. Sore af rn!,pnik91,0.0,0.0




DATASET SAMPLE:


date,id,content,username,like_count,retweet_count
2023-02-27 12:22:31+00:00,1630181582341283846,This is amazing (Crazy)!\n\nhttps://t.co/so9GuBQvpr\n#AI #ChatGPT https://t.co/mgwDW3tPGC,carlosospinocom,0.0,0.0
2023-02-08 01:13:00+00:00,1623127725899755521,"Oh, I asked #ChatGPT about title 11, #Oklahoma statutes, and he didn’t know Jack",red_river_post,0.0,0.0
2023-02-07 06:42:04+00:00,1622848151265718272,"Here we look at the potential impact and discuss that despite limitations like frequent mistakes and limited data upload capabilities, #ChatGPT has the potential to #automate data gathering and #analysis tasks in the future.\n\nhttps://t.co/09qFsLGFOy",BernardMarr,10.0,6.0
2023-03-14 20:43:16+00:00,1635743420390965250,"Hold onto your butts everyone, AI changes everything again \n\n#ai #gpt4 #ChatGPT #openai",LewisLovelock,3.0,0.0
2023-03-28 03:08:06+00:00,1640551306539728896,"So here is how debate start lmao. I`ll look at it as 2 different medium and it serve different purpose, replace? Not at the moment I think #chatgpt #chatgpt4 #ai #aiforbusiness #metaverse #blockchain #dao #defi #cefi #ceolife #ceo #founders https://t.co/yT5JNBYLLI",johnchin2832,0.0,0.0
2023-01-12 09:43:46+00:00,1613471791875235840,"@Alexmathers84 Growth hack 101- build a Chat GPT personal brand, put a name and a face to the mysterious overlord",willfaconnelly,2.0,0.0
2023-01-12 02:34:42+00:00,1613363811276435459,It's great to see how AI is helping creators to overcome creative blocks...\n\n#AI #ChatGPT #Canva,sohrabkhawas,4.0,2.0
2023-01-26 21:23:45+00:00,1618721375996481536,ChatGPT is the new Finance Minister of Pakistan.\n#ChatGPT,KyaaBatHai,0.0,0.0
2023-02-11 04:02:03+00:00,1624257431223078912,#1 SIGN UP TO CHAT GPT.\n\nYou're going to create an account.\n\nYou can use your gmail.,GuilhermeWrites,0.0,0.0
2023-01-08 05:21:36+00:00,1611956263843409923,@hasantoxr Chat gpt,KaranNandwani9,0.0,0.0
