In [41]:
import os
import pandas as pd

def combine_csv_files(directory):
    # Get a list of all CSV files in the directory
    files = [file for file in os.listdir(directory) if file.endswith('.csv')]
    
    # Initialize an empty DataFrame
    combined_df = pd.DataFrame()
    
    # Iterate over each file and append to the combined DataFrame
    for file in files:
        file_path = os.path.join(directory, file)
        df = pd.read_csv(file_path)
        combined_df = pd.concat([combined_df, df], ignore_index=True)
    
    return combined_df

# Example usage:
directory_path = 'db'  # Replace with your directory path
combined_df = combine_csv_files(directory_path)

# Now combined_df contains the concatenated DataFrame of all CSV files in the directory


In [42]:
combined_df

Unnamed: 0,comment,label
0,cristiana ronaldo fans cr7,I
1,old but gold for us,F
2,yes we are present,I
3,14 /08/ 2024,I
4,thanks sir,F
...,...,...
401363,10 million,I
401364,how many times have you gone to beautify parlo...,I
401365,hello man,I
401366,hello shivani ma'amma'am it's my humble reques...,F


In [43]:
combined_df = combined_df.dropna()

In [44]:
combined_df['label'].unique()

array(['I', 'F', 'D'], dtype=object)

In [45]:
combined_df = combined_df[combined_df['label'].isin(['D', 'F', 'I'])]

In [46]:
combined_df['label'].value_counts()

label
F    185221
I    124255
D     83565
Name: count, dtype: int64

In [47]:
def process(x):
    x = ''.join([l for l in x if (l.isalpha() or l==' ')])
    x = x.strip(' ')
    return x

In [48]:
combined_df['comment'] = list(map(process, combined_df['comment']))

In [49]:
combined_df = combined_df[combined_df['comment']!='']

In [50]:
combined_df = combined_df.dropna()

In [51]:
combined_df = combined_df.drop_duplicates()

In [52]:
combined_df

Unnamed: 0,comment,label
0,cristiana ronaldo fans cr,I
1,old but gold for us,F
2,yes we are present,I
4,thanks sir,F
5,any body after years more from august,I
...,...,...
401359,mam ji can you help me please,F
401360,thank you so muchshivaani maam is just awesome,F
401361,long methods apply,D
401364,how many times have you gone to beautify parlo...,I


In [53]:
combined_df['label'].value_counts()

label
F    136629
I     89586
D     79139
Name: count, dtype: int64

In [54]:
combined_df['label'] = combined_df['label'].map({'F': 'feedbak', 'D': 'doubt', 'I': 'irrelevant'})

In [55]:
combined_df

Unnamed: 0,comment,label
0,cristiana ronaldo fans cr,irrelevant
1,old but gold for us,feedbak
2,yes we are present,irrelevant
4,thanks sir,feedbak
5,any body after years more from august,irrelevant
...,...,...
401359,mam ji can you help me please,feedbak
401360,thank you so muchshivaani maam is just awesome,feedbak
401361,long methods apply,doubt
401364,how many times have you gone to beautify parlo...,irrelevant


In [56]:
combined_df.to_csv('dataset.csv', index=False)

In [58]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Step 1: Load your CSV file into a Dataset object
dataset = Dataset.from_csv('dataset.csv')  # Replace with your CSV file path

# Step 2: Split the dataset into train and test sets (80% train, 20% test)
train_test = dataset.train_test_split(test_size=0.2)

# Step 3: Further split the test set into validation and test sets (50% validation, 50% test from the 20% split)
test_valid = train_test['test'].train_test_split(test_size=0.5)

# Step 4: Combine the splits into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_test['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})

Generating train split: 0 examples [00:00, ? examples/s]

In [59]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['comment', 'label'],
        num_rows: 244283
    })
    validation: Dataset({
        features: ['comment', 'label'],
        num_rows: 30535
    })
    test: Dataset({
        features: ['comment', 'label'],
        num_rows: 30536
    })
})

In [62]:
from huggingface_hub import login

In [63]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [64]:
dataset_dict.push_to_hub("hammadali1805/yt-comments-edu-IN")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/245 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/hammadali1805/yt-comments-edu-IN/commit/57417a2a9a68e56a9f3bb57ca8f077187f9c7d8b', commit_message='Upload dataset', commit_description='', oid='57417a2a9a68e56a9f3bb57ca8f077187f9c7d8b', pr_url=None, pr_revision=None, pr_num=None)