### Computational Social Intelligence (H)
# Assessed Coursework

In [1]:
import difflib
import numpy as np
import pandas as pd
from IPython.display import Markdown


def print_md(text):
    """Render ``text`` as Markdown in the cell's output area.

    Wraps the string in an IPython ``Markdown`` object and hands it to
    ``display`` so that headings and tables are rendered rather than
    printed as plain text.
    """
    rendered = Markdown(text)
    display(rendered)

def enum(**enums):
    """Create a lightweight enum-like class from keyword arguments.

    Each keyword becomes a class attribute holding the given value, e.g.
    ``enum(SMILE=0, FROWN=1).SMILE == 0``. The returned class is always
    named ``Enum`` and has no explicit base classes.
    """
    members = dict(enums)
    return type("Enum", tuple(), members)

## Part 1

### Loading data

In [2]:
# Read the laughter corpus (one row per laughter event) from disk, then leave
# the frame as the cell's last expression so the notebook renders it.
corpus_path = "data/laughter-corpus.csv"
laughter_corpus = pd.read_csv(corpus_path)

laughter_corpus

Unnamed: 0,Gender,Role,Duration
0,Female,Caller,0.961
1,Male,Receiver,0.630
2,Female,Caller,1.268
3,Male,Receiver,0.146
4,Female,Caller,0.276
...,...,...,...
837,Female,Caller,0.328
838,Female,Caller,0.167
839,Female,Receiver,0.533
840,Female,Receiver,0.353


## Is the number of laughter events higher for women than for men?

**Research hypothesis:** The number of laughter events is higher for women than for men.

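**Null hypothesis:** The number of laughter events is the same for women and men once the number of speakers of each gender is taken into account, i.e. events are distributed in proportion to the number of female and male speakers.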

In [16]:
# Total number of laughter events and the number of speakers of each gender.
# NOTE(review): the speaker counts are hard-coded rather than derived from the
# loaded frame -- confirm them against the corpus description.
events = len(laughter_corpus)
speakers = 120
male = 57
female = 63

# Under the null hypothesis the events are split between the genders in
# proportion to the number of speakers of each gender.
expected_male_events = (male/speakers) * events
expected_female_events = (female/speakers) * events

actual_male_events = len(laughter_corpus.query("Gender == 'Male'"))
actual_female_events = len(laughter_corpus.query("Gender == 'Female'"))

# Pearson chi-square statistic: sum over categories of (O - E)^2 / E.
mf_chi_square = sum(
    ((actual - expected)**2) / expected
    for actual, expected in [
        (actual_male_events, expected_male_events),
        (actual_female_events, expected_female_events),
    ]
)

# Chi-square critical value for 1 degree of freedom (two categories) at a
# significance level of 0.01.
CHI_SQUARE_CRITICAL_VALUE = 6.63

# A statistic below the critical value means the null hypothesis cannot be
# rejected; it does not show that the null hypothesis is true.
mf_chi_square_verdict = (
    "can be rejected"
    if mf_chi_square > CHI_SQUARE_CRITICAL_VALUE
    else "cannot be rejected"
)

print_md(
f"""
### Null hypothesis {mf_chi_square_verdict}. ({mf_chi_square:.2f})

|            |         Expected         |         Actual         |
| ---------- | ------------------------ | ---------------------- |
| **Male**   |  {expected_male_events}  |  {actual_male_events}  |
| **Female** | {expected_female_events} | {actual_female_events} |
"""
)


### Null hypothesis can be rejected. (13.86)

|            |         Expected         |         Actual         |
| ---------- | ------------------------ | ---------------------- |
| **Male**   |  399.95  |  346  |
| **Female** | 442.05 | 496 |


## Is the number of laughter events higher for callers than for receivers?

**Research hypothesis:** The number of laughter events is higher for callers than for receivers.

**Null hypothesis:** The number of laughter events is the same for callers and receivers, i.e. events are distributed in proportion to the number of callers and receivers.

In [18]:
# Total number of laughter events and the number of speakers in each role.
# NOTE(review): the role counts are hard-coded rather than derived from the
# loaded frame -- confirm them against the corpus description.
events = len(laughter_corpus)
callers = 60
receivers = 60

# Under the null hypothesis the events are split between the roles in
# proportion to the number of speakers in each role.
expected_caller_events = (callers/(callers+receivers)) * events
expected_receiver_events = (receivers/(callers+receivers)) * events

actual_caller_events = len(laughter_corpus.query("Role == 'Caller'"))
actual_receiver_events = len(laughter_corpus.query("Role == 'Receiver'"))

# Pearson chi-square statistic: sum over categories of (O - E)^2 / E.
cr_chi_square = sum(
    ((actual - expected)**2) / expected
    for actual, expected in [
        (actual_caller_events, expected_caller_events),
        (actual_receiver_events, expected_receiver_events),
    ]
)

# Chi-square critical value for 1 degree of freedom (two categories) at a
# significance level of 0.01.
CHI_SQUARE_CRITICAL_VALUE = 6.63

# A statistic below the critical value means the null hypothesis cannot be
# rejected; it does not show that the null hypothesis is true.
cr_chi_square_verdict = (
    "can be rejected"
    if cr_chi_square > CHI_SQUARE_CRITICAL_VALUE
    else "cannot be rejected"
)

print_md(
f"""
### Null hypothesis {cr_chi_square_verdict}. ({cr_chi_square:.2f})

|               |          Expected          |          Actual          |
| ------------- | -------------------------- | ------------------------ |
| **Callers**   |  {expected_caller_events}  |  {actual_caller_events}  |
| **Receivers** | {expected_receiver_events} | {actual_receiver_events} |
"""
)


### Null hypothesis can be rejected. (33.52)

|               |          Expected          |          Actual          |
| ------------- | -------------------------- | ------------------------ |
| **Callers**   |  421.0  |  505  |
| **Receivers** | 421.0 | 337 |


## Are laughter events longer for women?

**Research hypothesis:** Laughter events are longer for women.

**Null hypothesis:** Laughter events for women have the same duration as laughter events for men.

In [19]:
# Durations of the laughter events, split by the speaker's gender.
male_duration = laughter_corpus.query("Gender == 'Male'")["Duration"]
female_duration = laughter_corpus.query("Gender == 'Female'")["Duration"]

total_male_duration = male_duration.sum()
total_female_duration = female_duration.sum()

mean_male_duration = male_duration.mean()
mean_female_duration = female_duration.mean()

# pandas' Series.var() uses the sample variance (ddof=1) by default.
variance_male_duration = male_duration.var()
variance_female_duration = female_duration.var()

# Unequal-variance (Welch) t statistic: the difference of the means divided by
# the standard error of that difference. It is computed as male minus female,
# so a negative value means female events are longer on average.
mf_standard_error = (
    (variance_male_duration / len(male_duration))
    + (variance_female_duration / len(female_duration))
)**(1/2)
mf_t_value = (mean_male_duration - mean_female_duration) / mf_standard_error

# NOTE(review): 2.36 looks like the one-tailed critical value at a 0.01
# significance level for roughly 120 degrees of freedom -- confirm the
# intended table entry. It is compared against |t|, i.e. used two-sided.
T_CRITICAL_VALUE = 2.36

# A statistic below the critical value means the null hypothesis cannot be
# rejected; it does not show that the null hypothesis is true.
mf_t_verdict = (
    "can be rejected"
    if abs(mf_t_value) > T_CRITICAL_VALUE
    else "cannot be rejected"
)

print_md(
f"""
### Null hypothesis {mf_t_verdict}. ({mf_t_value:.2f})

|            |          Total          |          Mean          |          Variance          |
| ---------- | ----------------------- | ---------------------- | -------------------------- |
| **Male**   |  {total_male_duration}  |  {mean_male_duration}  |  {variance_male_duration}  |
| **Female** | {total_female_duration} | {mean_female_duration} | {variance_female_duration} |
"""
)


### Null hypothesis can be rejected. (-3.51)

|            |          Total          |          Mean          |          Variance          |
| ---------- | ----------------------- | ---------------------- | -------------------------- |
| **Male**   |  209.75600000000003  |  0.6062312138728323  |  0.16375125653011646  |
| **Female** | 352.004 | 0.7096854838709694 | 0.19502413118279563 |


## Are laughter events longer for callers?

**Research hypothesis:** Laughter events are longer for callers.

**Null hypothesis:** Laughter events for callers have the same duration as laughter events for receivers.

In [20]:
# Durations of the laughter events, split by the speaker's role in the call.
caller_duration = laughter_corpus.query("Role == 'Caller'")["Duration"]
receiver_duration = laughter_corpus.query("Role == 'Receiver'")["Duration"]

total_caller_duration = caller_duration.sum()
total_receiver_duration = receiver_duration.sum()

mean_caller_duration = caller_duration.mean()
mean_receiver_duration = receiver_duration.mean()

# pandas' Series.var() uses the sample variance (ddof=1) by default.
variance_caller_duration = caller_duration.var()
variance_receiver_duration = receiver_duration.var()

# Unequal-variance (Welch) t statistic: the difference of the means divided by
# the standard error of that difference. It is computed as caller minus
# receiver, so a positive value means caller events are longer on average.
cr_standard_error = (
    (variance_caller_duration / len(caller_duration))
    + (variance_receiver_duration / len(receiver_duration))
)**(1/2)
cr_t_value = (mean_caller_duration - mean_receiver_duration) / cr_standard_error

# NOTE(review): 2.36 looks like the one-tailed critical value at a 0.01
# significance level for roughly 120 degrees of freedom -- confirm the
# intended table entry. It is compared against |t|, i.e. used two-sided.
T_CRITICAL_VALUE = 2.36

# A statistic below the critical value means the null hypothesis cannot be
# rejected; it does not show that the null hypothesis is true.
cr_t_verdict = (
    "can be rejected"
    if abs(cr_t_value) > T_CRITICAL_VALUE
    else "cannot be rejected"
)

print_md(
f"""
### Null hypothesis {cr_t_verdict}. ({cr_t_value:.2f})

|               |           Total           |           Mean           |           Variance           |
| ------------- | ------------------------- | ------------------------ | ---------------------------- |
| **Callers**   |  {total_caller_duration}  |  {mean_caller_duration}  |  {variance_caller_duration}  |
| **Receivers** | {total_receiver_duration} | {mean_receiver_duration} | {variance_receiver_duration} |
"""
)


### Null hypothesis can be rejected. (7.05)

|               |           Total           |           Mean           |           Variance           |
| ------------- | ------------------------- | ------------------------ | ---------------------------- |
| **Callers**   |  376.61199999999997  |  0.7457663366336641  |  0.2126547944994499  |
| **Receivers** | 185.14800000000002 | 0.5494005934718107 | 0.11976459202345631 |


## Part 2

## Gaussian Discriminant Function

In [7]:
def gaussian_discriminant_function(vector, likelihood, deviation):
    """Score a sample against one class with a per-feature Gaussian model.

    Parameters
    ----------
    vector : pandas.Series
        One sample. Its last element is dropped before scoring (the caller
        passes rows whose final entry is the class label).
    likelihood : pandas.Series
        Per-feature centre of the class; the caller passes the class means.
    deviation : pandas.Series
        Per-feature standard deviation.

    Returns
    -------
    The negated sum over features of
    ``log(sqrt(2 * pi) * deviation) + (x - centre)**2 / (2 * deviation**2)``;
    a larger value means the sample fits the class better.
    """
    features = vector.to_numpy()[:-1]
    offsets = features - likelihood.to_numpy()

    log_normaliser = np.log(np.sqrt(2 * np.pi) * deviation)
    scaled_squared_offsets = (offsets**2) / (2 * (deviation**2))

    return -np.sum(log_normaliser + scaled_squared_offsets)

# Positions of each class inside `likelihoods` (same order as the label list
# used to build it below).
index = enum(SMILE=0, FROWN=1)

# Estimate the model parameters from the training set only, so that the error
# reported for the test set is measured on data the model has not seen.
training_data = pd.read_csv("data/training-part-2.csv")

# Per-class feature means (sum of each numeric column over the class's rows
# divided by the number of rows in that class).
likelihoods = [
    (
        training_data.query(f"Class == '{c}'").sum(numeric_only=True)
        / len(training_data.query(f"Class == '{c}'"))
    )
    for c in ["smile", "frown"]
]

# numeric_only=True skips the string `Class` column; without it pandas >= 2.0
# raises instead of silently dropping non-numeric columns.
# NOTE(review): this is a single deviation per feature computed over both
# classes together, not a per-class deviation -- confirm that is intended.
standard_deviation = training_data.std(numeric_only=True)

for f in ["training-part-2.csv", "test-part-2.csv"]:
    file_to_test = pd.read_csv(f"data/{f}")
    actual_classes = list(file_to_test["Class"])

    # Assign each row to the class with the larger discriminant value.
    # The discriminant drops the last element of each row, so `Class` is
    # expected to be the final column.
    predictions = [
        (
            "smile"
            if (
                gaussian_discriminant_function(
                    vector, likelihoods[index.SMILE], standard_deviation
                )
                > gaussian_discriminant_function(
                    vector, likelihoods[index.FROWN], standard_deviation
                )
            )
            else "frown"
        ) for _, vector in file_to_test.iterrows()
    ]

    # Error rate = share of rows whose prediction differs from the label at
    # the same position. (A sequence-similarity ratio is not positional, so it
    # is not a reliable error rate.)
    mismatches = sum(
        prediction != actual
        for prediction, actual in zip(predictions, actual_classes)
    )
    error_rate = (
        round(mismatches / len(actual_classes) * 100, 2)
        if actual_classes
        else 0.0
    )

    # One markdown table row per sample; wrong predictions are emphasised.
    table_rows = "\n".join(
        "| **{}** | {} | {} |".format(
            row_number,
            prediction if prediction == actual else f"_**{prediction}**_",
            actual,
        )
        for row_number, (prediction, actual) in enumerate(
            zip(predictions, actual_classes)
        )
    )

    print_md(
f"""
### The error rate of the prediction for `{f}` is {error_rate}%.

|   | Prediction | Actual |
| - | ---------- | ------ |
"""
+ table_rows
    )


### The error rate of the prediction for `training-part-2.csv` is 2.78%.

|   | Prediction | Actual |
| - | ---------- | ------ |
| **0** | frown | frown |
| **1** | frown | frown |
| **2** | frown | frown |
| **3** | frown | frown |
| **4** | frown | frown |
| **5** | frown | frown |
| **6** | frown | frown |
| **7** | frown | frown |
| **8** | frown | frown |
| **9** | frown | frown |
| **10** | frown | frown |
| **11** | frown | frown |
| **12** | frown | frown |
| **13** | frown | frown |
| **14** | frown | frown |
| **15** | frown | frown |
| **16** | frown | frown |
| **17** | frown | frown |
| **18** | smile | smile |
| **19** | smile | smile |
| **20** | smile | smile |
| **21** | smile | smile |
| **22** | smile | smile |
| **23** | smile | smile |
| **24** | smile | smile |
| **25** | smile | smile |
| **26** | smile | smile |
| **27** | smile | smile |
| **28** | smile | smile |
| **29** | smile | smile |
| **30** | smile | smile |
| **31** | smile | smile |
| **32** | smile | smile |
| **33** | smile | smile |
| **34** | _**frown**_ | smile |
| **35** | smile | smile |


### The error rate of the prediction for `test-part-2.csv` is 0.0%.

|   | Prediction | Actual |
| - | ---------- | ------ |
| **0** | frown | frown |
| **1** | frown | frown |
| **2** | frown | frown |
| **3** | frown | frown |
| **4** | frown | frown |
| **5** | frown | frown |
| **6** | frown | frown |
| **7** | frown | frown |
| **8** | smile | smile |
| **9** | smile | smile |
| **10** | smile | smile |
| **11** | smile | smile |
| **12** | smile | smile |
| **13** | smile | smile |
| **14** | smile | smile |
| **15** | smile | smile |