# Exploring data

Because the first trained model had performance issues, we need to examine the data more closely.

In [1]:
import pandas as pd
from pathlib import Path
import re
from typing import List

In [None]:
# Load every ClaimBuster JSON file found under the current working directory
# into its own DataFrame, then print a short preview of each part.
here = Path.cwd()
cbdata_path = here / ".data_sets" / "ClaimBuster_Datasets" / "datasets" # ClaimBuster data location
raw_dfs: List[pd.DataFrame] = []

# Path.iterdir() yields entries in arbitrary order; sorting keeps the part
# numbering below (and the row order of anything built from raw_dfs) stable
# from one run or machine to the next.
for file in sorted(cbdata_path.iterdir()):
    # is_file() is already False for paths that do not exist.
    if file.is_file() and file.suffix == ".json":
        raw_dfs.append(pd.read_json(file))

assert len(raw_dfs) > 0, f"no .json files found in {cbdata_path}"

for part_no, part in enumerate(raw_dfs):
    assert isinstance(part, pd.DataFrame)
    print(f"--- part {part_no:2} ---")
    print(part.head())
    print(part.describe())

--- part  0 ---
   sentence_id  label                                               text
0        27247      1                We're 9 million jobs short of that.
1        10766      1  You know, last year up to this time, we've los...
2         3327      1  And in November of 1975 I was the first presid...
3        19700      1  And what we've done during the Bush administra...
4        12600      1  Do you know we don't have a single program spo...
        sentence_id        label
count   9674.000000  9674.000000
mean   16268.353628     0.285714
std     9388.575939     0.451777
min       16.000000     0.000000
25%     8344.000000     0.000000
50%    16455.500000     0.000000
75%    24086.250000     1.000000
max    34458.000000     1.000000
--- part  1 ---
   sentence_id  label                                               text
0        15083      1  When I made my decision to stop all trade with...
1        16799      1  We've got the highest inflation we've had in t...
2        32570

In [3]:
# Stack all loaded parts into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it every part keeps its own row labels, so the
# combined frame would contain duplicate index labels.
df = pd.concat(raw_dfs, ignore_index=True)
print(df.describe())
print(f"Dataset Size: {len(df)}")

        sentence_id         label
count  29022.000000  29022.000000
mean   16281.469161      0.285714
std     9401.659478      0.451762
min       16.000000      0.000000
25%     8384.500000      0.000000
50%    16455.500000      0.000000
75%    24089.000000      1.000000
max    34458.000000      1.000000
Dataset Size: 29022


## Analysis of data

In [4]:
def _texts_matching_any(texts: pd.Series, patterns: List[str]) -> List[str]:
    """Return the entries of `texts`, in order, that match at least one regex in `patterns`.

    Matching is case-insensitive. The patterns are joined into one alternation
    so each text needs a single search call instead of one per pattern.
    """
    combined = re.compile("|".join(f"(?:{p})" for p in patterns), re.IGNORECASE)
    return [text for text in texts if combined.search(text)]


def _pct(part: int, total: int) -> float:
    """Return `part` as a percentage of `total`, or 0.0 when `total` is zero."""
    return part / total * 100 if total else 0.0


print("=== CLAIMBUSTERS DATA QUALITY ANALYSIS ===\n")

# 1. Check label distribution
print("1. LABEL DISTRIBUTION:")
n_total = len(df)
n_claims = int((df['label'] == 1).sum())
n_non_claims = int((df['label'] == 0).sum())
print(f"Total samples: {n_total}")
print(f"Claims (LABEL_1): {n_claims} ({_pct(n_claims, n_total):.1f}%)")
print(f"Non-claims (LABEL_0): {n_non_claims} ({_pct(n_non_claims, n_total):.1f}%)")

# 2. Look at clearly factual statements that are labeled as non-claims
print("\n2. SUSPICIOUS NON-CLAIM LABELS (should probably be claims):")
factual_patterns = [
    r'\b\d{4}\b',  # Years
    # No trailing \b here: '%' is not a word character, so "\d+%\b" would only
    # match when a letter or digit directly follows the percent sign and would
    # miss ordinary text such as "50% of".
    r'\b\d+%',     # Percentages
    r'\$\d+',      # Dollar amounts
    r'\b\d+\.\d+\b', # Decimal numbers
    r'\bwas\s+(president|senator|governor|born|elected)\b',  # Factual verbs
    r'\bin\s+\d{4}\b',  # "in [year]"
]

suspicious_non_claims = _texts_matching_any(df.loc[df['label'] == 0, 'text'], factual_patterns)

print(f"Found {len(suspicious_non_claims)} suspicious non-claim labels")
print("Sample suspicious non-claims:")
for i, text in enumerate(suspicious_non_claims[:15], 1):
    print(f"{i:2d}. {text}")

# 3. Look at claims that seem like opinions
print("\n3. SUSPICIOUS CLAIM LABELS (should probably be non-claims):")
opinion_patterns = [
    r'\b(best|worst|great|terrible|amazing|awful)\b',
    r'\b(should|must|need to|have to)\b',
    r'\b(beautiful|ugly|smart|stupid)\b',
    r'\b(love|hate|like|dislike)\b',
    r'\b(believe|think|feel)\b',
]

suspicious_claims = _texts_matching_any(df.loc[df['label'] == 1, 'text'], opinion_patterns)

print(f"Found {len(suspicious_claims)} suspicious claim labels")
print("Sample suspicious claims:")
for i, text in enumerate(suspicious_claims[:15], 1):
    print(f"{i:2d}. {text}")

# 4. Check for very short or very long samples
print("\n4. SAMPLE LENGTH ANALYSIS:")
df['length'] = df['text'].str.len()
print(f"Average text length: {df['length'].mean():.1f} characters")
print(f"Shortest text: {df['length'].min()} chars")
print(f"Longest text: {df['length'].max()} chars")

# Show very short samples
short_samples = df[df['length'] < 20]
print(f"\nVery short samples ({len(short_samples)}):")
short_preview = short_samples.head(10)
for text, label in zip(short_preview['text'], short_preview['label']):
    print(f"  '{text}' → Label: {label}")

# 5. Check for exact duplicates
print("\n5. DUPLICATE ANALYSIS:")
duplicates = df[df.duplicated(subset=['text'], keep=False)]
print(f"Total duplicate texts: {len(duplicates)}")
if len(duplicates) > 0:
    print("Sample duplicates with different labels:")
    # Collect the distinct labels of every duplicated text in one pass, keep
    # only the texts whose copies disagree, and only then take the sample of 5.
    # Sampling before filtering would usually show nothing, because the first
    # few duplicated texts need not be the conflicting ones.
    labels_by_text = duplicates.groupby('text', sort=False)['label'].unique()
    conflicting = labels_by_text[labels_by_text.map(len) > 1]
    for text, labels in conflicting.head(5).items():
        print(f"  '{text}' has labels: {labels}")

=== CLAIMBUSTERS DATA QUALITY ANALYSIS ===

1. LABEL DISTRIBUTION:
Total samples: 29022
Claims (LABEL_1): 8292 (28.6%)
Non-claims (LABEL_0): 20730 (71.4%)

2. SUSPICIOUS NON-CLAIM LABELS (should probably be claims):
Found 245 suspicious non-claim labels
Sample suspicious non-claims:
 1. We feel that you can hold the line and restrain federal spending, give a tax reduction and still have a balanced budget by 1978.
 2. And for every dollar that I spend in those two categories, I'll put $2 toward paying down the national debt.
 3. And in the years to come it will be written that one or the other of us was elected and that he was or was not a great president.
 4. In 1957 I was in Havana.
 5. I think uh - the way to get tax equity in this country is to give tax relief to the middle-income people who have an income from roughly $8 thousand up to twenty-five or thirty thousand dollars.
 6. We've got to innovate through this America 2000 program.
 7. George, we have supply management today und