In [10]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


In [2]:
# Imports

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs

In [12]:
# Setting random seed and device
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# TASK 1

In [14]:
# Load data
train_df = pd.read_csv('data/task-1/train.csv')
test_df = pd.read_csv('data/task-1/dev.csv')

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9652 entries, 0 to 9651
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         9652 non-null   int64  
 1   original   9652 non-null   object 
 2   edit       9652 non-null   object 
 3   grades     9652 non-null   int64  
 4   meanGrade  9652 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 377.2+ KB


In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2419 entries, 0 to 2418
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2419 non-null   int64 
 1   original  2419 non-null   object
 2   edit      2419 non-null   object
dtypes: int64(1), object(2)
memory usage: 56.8+ KB


In [17]:
train_df['original'][0]

'France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq'

In [18]:
train_df.head()

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
4,6164,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0


In [19]:
train_df.sort_values('meanGrade',ascending=False)[['original','edit']]

Unnamed: 0,original,edit
6075,Recent Scandals Highlight Trump 's Chaotic <Ma...,Fashion
1490,President Trump 's first year <anniversary/> r...,Kindergarten
4438,How an FBI raid fed a rumor that Orrin Hatch w...,pregnant
8726,Trump admits tariffs could cause ' pain ' in <...,buttock
6946,Kasich : Trump <tweets/> ‘ unacceptable ’,hair
...,...,...
9250,It did n’t end at the ballot box : <Donald Tru...,cutlery
1237,Delhi Police Say They 've Captured Most-Wanted...,Hindu
9253,Trump 's Climate-Denying Coal Lobbyist Nominee...,Mine
4421,Democratic division simmers at feel-good <retr...,stove


# TASK 2

In [3]:
# Load data
train_df = pd.read_csv('data/task-2/train.csv')
test_df = pd.read_csv('data/task-2/dev.csv')

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9381 entries, 0 to 9380
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          9381 non-null   object 
 1   original1   9381 non-null   object 
 2   edit1       9381 non-null   object 
 3   grades1     9381 non-null   int64  
 4   meanGrade1  9381 non-null   float64
 5   original2   9381 non-null   object 
 6   edit2       9381 non-null   object 
 7   grades2     9381 non-null   int64  
 8   meanGrade2  9381 non-null   float64
 9   label       9381 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 733.0+ KB


In [8]:
train_df

Unnamed: 0,id,original1,edit1,grades1,meanGrade1,original2,edit2,grades2,meanGrade2,label
0,10920-9866,""" Gene Cernan , Last <Astronaut/> on the Moon ...",Dancer,1113,1.2,""" Gene Cernan , Last Astronaut on the Moon , <...",impregnated,30001,0.8,1
1,3176-10722,""" I 'm done "" : Fed up with California , some ...",vagrants,1200,0.6,""" I 'm done "" : Fed up with <California/> , so...",pancakes,10110,0.6,0
2,3176-3702,""" I 'm done "" : Fed up with California , some ...",vagrants,1200,0.6,""" I 'm done "" : Fed up with <California/> , so...",life,2,0.4,1
3,10722-3702,""" I 'm done "" : Fed up with <California/> , so...",pancakes,10110,0.6,""" I 'm done "" : Fed up with <California/> , so...",life,2,0.4,1
4,12282-2083,""" Our expectations of what civic engagement lo...",imagine,0,0.0,""" Our expectations of what civic engagement <l...",smells,100220010,0.6,2
...,...,...,...,...,...,...,...,...,...,...
9376,975-13357,"“ It ’s painfully obvious "" Mueller will charg...",battery,1,0.2,"“ It ’s painfully obvious "" Mueller will charg...",plumbing,11103,1.2,2
9377,975-11773,"“ It ’s painfully obvious "" Mueller will charg...",battery,1,0.2,"“ It ’s painfully obvious "" Mueller will <char...",strangle,22331,2.2,2
9378,13357-11773,"“ It ’s painfully obvious "" Mueller will charg...",plumbing,11103,1.2,"“ It ’s painfully obvious "" Mueller will <char...",strangle,22331,2.2,2
9379,14954-14479,"“ Kompromat , ” media ethics and the law : Wha...",porn,20101,0.8,"“ Kompromat , ” media ethics and the law : Wha...",dance,32112,1.8,2


In [22]:
funniest = train_df.sort_values('meanGrade1',ascending=False)[['original1','edit1','meanGrade1','edit2']]

In [25]:
for i in range(5):
    print(funniest['original1'][i],funniest['edit1'][i],' vs ',funniest['edit2'][i])

" Gene Cernan , Last <Astronaut/> on the Moon , Dies at 82 " Dancer  vs  impregnated
" I 'm done " : Fed up with California , some <conservatives/> look to Texas vagrants  vs  pancakes
" I 'm done " : Fed up with California , some <conservatives/> look to Texas vagrants  vs  life
" I 'm done " : Fed up with <California/> , some conservatives look to Texas pancakes  vs  life
" Our expectations of what civic engagement looks like do n’t match reality . Can we <fix/> that ? " imagine  vs  smells
