In [1]:
import pandas as pd

dataset_dir = "data/train_test_val"

DATASET = {
    'train': pd.read_csv(dataset_dir + '/train.csv').reset_index(drop=True), 
    'test': pd.read_csv(dataset_dir + '/test.csv').reset_index(drop=True), 
    'val': pd.read_csv(dataset_dir + '/val.csv').reset_index(drop=True), 
}

In [2]:
LABELS = [label for label in DATASET['train'].keys() if label not in ['ID', 'Text']]
id2label = {idx:label for idx, label in enumerate(LABELS)}
label2id = {label:idx for idx, label in enumerate(LABELS)}
LABELS

['Murder',
 'Homicide',
 'Robbery',
 'Physical Injuries',
 'Rape',
 'Theft',
 'Carnapping',
 'Others']

In [3]:
label_counts = {label: DATASET['train'][label].sum() for label in LABELS}

for label, count in label_counts.items():
    print(f"{label}: {count}")


Murder: 344
Homicide: 356
Robbery: 411
Physical Injuries: 351
Rape: 385
Theft: 316
Carnapping: 347
Others: 300


### Test label count

In [4]:
label_counts = {label: DATASET['test'][label].sum() for label in LABELS}

for label, count in label_counts.items():
    print(f"{label}: {count}")


Murder: 179
Homicide: 179
Robbery: 209
Physical Injuries: 181
Rape: 200
Theft: 158
Carnapping: 148
Others: 153


In [5]:
train_ids = set(DATASET['train']['Text'])
val_ids = set(DATASET['test']['Text'])
test_ids = set(DATASET['val']['Text'])

train_val_overlap = train_ids.intersection(val_ids)
train_test_overlap = train_ids.intersection(test_ids)
val_test_overlap = val_ids.intersection(test_ids)

print(f"Number of overlapping texts between train and val: {len(train_val_overlap)}")
print(f"Number of overlapping texts between train and test: {len(train_test_overlap)}")
print(f"Number of overlapping texts between val and test: {len(val_test_overlap)}")


print(len(DATASET['train']['Text']))
print(len(DATASET['test']['Text']))
print(len(DATASET['val']['Text']))


Number of overlapping texts between train and val: 134
Number of overlapping texts between train and test: 55
Number of overlapping texts between val and test: 28
2400
1200
400


# Word Frequency

In [14]:
import pandas as pd
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

STOP_WORDS = set(stopwords.words('english'))

def most_common_words(label, dataframe, n=30):
    """
    Given a label and a dataframe, returns the n most common words excluding stop words.

    Parameters:
    - label (str): The label to filter by (e.g., 'Murder', 'Robbery').
    - dataframe (pd.DataFrame): The dataframe containing the text data.
    - n (int): The number of most common words to return.

    Returns:
    - List of tuples with the most common words and their frequencies.
    """
    # Filter the dataframe by the given label where the label is 1
    label_data = dataframe[dataframe[label] == 1]

    # Combine all text entries into one large string
    all_text = ' '.join(label_data['Text'].tolist())

    # Convert text to lowercase
    all_text = all_text.lower()

    # Remove punctuation
    all_text = all_text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text into words
    words = word_tokenize(all_text)

    # Filter out stop words and words with length less than 2
    filtered_words = [word for word in words if word not in STOP_WORDS and len(word) > 1]

    # Count word frequencies
    word_freq = Counter(filtered_words)

    # Return the n most common words
    return word_freq.most_common(n)

for label in LABELS:
    print(f"\nMost common words for label '{label}':")
    common_words = most_common_words(label, DATASET['train'], n=20)
    
    
    for idx, (word, frequency) in enumerate(common_words, start=1):
        print(f" {idx}. {word}: {frequency}")


[nltk_data] Downloading package stopwords to /home/syke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/syke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Most common words for label 'Murder':
 1. saw: 155
 2. man: 140
 3. murder: 129
 4. house: 110
 5. shot: 110
 6. like: 106
 7. act: 90
 8. heard: 86
 9. one: 81
 10. calculated: 79
 11. planned: 77
 12. started: 74
 13. way: 74
 14. coldblooded: 73
 15. left: 72
 16. deliberate: 61
 17. guy: 60
 18. clear: 59
 19. found: 58
 20. made: 58

Most common words for label 'Homicide':
 1. man: 269
 2. saw: 172
 3. heard: 135
 4. head: 127
 5. one: 125
 6. hit: 116
 7. started: 109
 8. victim: 103
 9. fell: 83
 10. woman: 81
 11. trying: 75
 12. unintentional: 72
 13. tried: 71
 14. pulled: 70
 15. loud: 68
 16. like: 66
 17. grabbed: 65
 18. knife: 64
 19. floor: 64
 20. ran: 64

Most common words for label 'Robbery':
 1. man: 313
 2. using: 127
 3. leaving: 123
 4. saw: 123
 5. money: 105
 6. car: 103
 7. terrifying: 101
 8. wallet: 97
 9. valuables: 91
 10. heard: 90
 11. hand: 88
 12. tactics: 81
 13. left: 77
 14. house: 76
 15. gun: 74
 16. employing: 73
 17. one: 72
 18. menacing: 71
 