#### Experiments with Article Content ONLY

### Libraries

In [1]:
from util import *
import pickle



### Data Cleaning

- We are loading the data from both Dawn and The Express Tribune.

In [2]:
# Load the annotated articles; the file is decoded as ISO-8859-1.
data = pd.read_csv(
    'articles-annotated.csv',
    encoding="ISO-8859-1",
)

In [3]:
# Drop every row that has a missing value in any column; this includes the
# rows where no label was given.
data = data.dropna(axis=0, how='any')

In [4]:
# Labels are read as floats; store them as integers instead.
label_as_int = data['Label'].astype(int)
data['Label'] = label_as_int
# Persist the cleaned frame (without the index column) for later cells.
data.to_csv('cleaned-data.csv', index=False)

In [5]:
# Start again from the cleaned file written by the previous cell.
data = pd.read_csv(filepath_or_buffer='cleaned-data.csv')

### Preprocessing

In [6]:
def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Each token is NFKD-normalized first, which splits accented letters into
    a base letter plus combining marks; the marks (and any other non-ASCII
    characters) are then dropped by encoding to ASCII with errors ignored.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list with one ASCII-only string per input token. A token with
        no ASCII-representable characters becomes an empty string.
    """
    cleaned = []
    for token in words:
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        cleaned.append(ascii_bytes.decode('utf-8', 'ignore'))
    return cleaned
# Lower-casing the tokens avoids having multiple copies of the same word
# that differ only in capitalisation.
def to_lowercase(words):
    """Return a new list containing the lower-cased form of every token.

    Args:
        words: iterable of tokens exposing a ``lower()`` method (str).

    Returns:
        A list of the same length as ``words`` with each token lower-cased.
    """
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered
# Removing punctuation to reduce the amount of the training data
def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens left empty.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is removed. Tokens that consisted
    only of such characters are discarded.

    Args:
        words: iterable of str tokens.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
    """
    # Run the substitution once per token, then filter on the result.
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [token for token in stripped if token != '']
# Removing numbers from data since they aren't useful in this context.
def replace_numbers(words):
    """Remove digits from every token and drop tokens left empty.

    The previous implementation reused the punctuation pattern
    (``[^\\w\\s]``), which never matches digits, so numbers were left in
    the data. Digits are now removed explicitly.

    Args:
        words: iterable of str tokens.

    Returns:
        A list of the tokens with all digit runs removed, in their original
        order. Tokens that were purely numeric are discarded so that no
        empty strings reach the later pipeline steps.
    """
    stripped = (re.sub(r'\d+', '', word) for word in words)
    return [token for token in stripped if token]
def remove_stopwords(words):
    """Return the tokens that are not English stop words.

    ``stopwords`` is expected to be in scope through the file's imports
    (presumably NLTK's stop-word corpus reader -- confirm in ``util``).

    Args:
        words: iterable of str tokens.

    Returns:
        A list of the tokens not present in ``stopwords.words('english')``,
        in their original order.
    """
    # Build the stop-word collection once instead of once per token, and use
    # a set so that each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

In [7]:
def normalize(words):
    """Run the full token clean-up pipeline on a list of tokens.

    The steps are applied in this order, each one receiving the output of
    the previous one: remove_non_ascii, to_lowercase, remove_punctuation,
    replace_numbers, remove_stopwords.

    Args:
        words: list of str tokens.

    Returns:
        The list of tokens produced by the last pipeline step.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [8]:
# Split every article into tokens, then clean each token list with normalize.
data['tokenized_Content'] = (
    data['Content']
    .apply(word_tokenize)
    .apply(normalize)
)

In [9]:
# Write the frame, including the new tokenized_Content column, to disk
# without the index column.
data.to_csv(path_or_buf="preprocessed.csv", index=False)

In [10]:
# Continue from the preprocessed file written above.
# NOTE(review): after the CSV round trip tokenized_Content presumably holds
# the text form of each token list rather than a list object -- confirm.
data = pd.read_csv(filepath_or_buffer='preprocessed.csv')

#### Logistic Regression with TFIDF

In [11]:
# Features are the pre-processed article text, targets are the labels.
X, y = data['tokenized_Content'], data['Label']
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)
# Report the size of every split.
for split_name, split in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(split_name, split.shape)

X_train (179,)
X_test (45,)
y_train (179,)
y_test (45,)


In [12]:
# The following parameters were the best according to our dataset
tdf = TfidfVectorizer(
    stop_words='english',
    max_df=0.65,
    min_df=0.001,
    ngram_range=(1, 1),
)
# Fit on the training split only and vectorize it in the same pass.
# fit_transform fits `tdf` itself, so `vectorizer` is that fitted object.
train_transformed = tdf.fit_transform(X_train)
vectorizer = tdf

In [14]:
# An L1 penalty is not supported by the 'lbfgs' solver, which is the default
# in newer scikit-learn releases and makes fit() raise a ValueError. Pin
# 'liblinear' (the former default), which supports L1.
model = LogisticRegression(penalty='l1', solver='liblinear')
# Train on the TF-IDF features of the training split.
model = model.fit(train_transformed, y_train)







#### Saving trained model

In [None]:
#saving tfidf model
# The context manager closes (and flushes) the file even if pickling fails.
with open("tfidf.pickle", "wb") as tfidf_file:
    pickle.dump(vectorizer, tfidf_file)

In [15]:
#saving logistic regression
filename = 'logistic_regression_TFIDF.sav'
# The context manager closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

#### Model Statistics
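- Uncomment to see the model performance. The cell expects `predicted` to hold the model's predictions for the test set (e.g. `predicted = model.predict(vectorizer.transform(X_test))`), which is not computed in the cells above.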

In [31]:
# # Score only on the test set. NO CV
# print ("Logistig Regression: \n")
# print ( "F1 score {:.4}%".format(f1_score(y_test, predicted, average='macro')*100 ) )
# print ( "Accuracy score {:.4}%\n\n".format(accuracy_score(y_test, predicted)*100) )
# error = mean_squared_error(y_test, predicted)
# print('MSE',error)
# print(metrics.classification_report(y_test, predicted,target_names=['Fake','Unverified','Real']))