## 00 |

In [1]:
# [Basic Libraries]
import os # file management 
import pandas as pd # data manipulation
import numpy as np # numerical operations

# [Machine Learning]
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.model_selection import train_test_split # train/test sets
from sklearn.linear_model import LogisticRegression # logistic model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, cohen_kappa_score # confussion matrix metric & displayer, cohen kappa score

# [New!] How to deal with imbalanced Data?
from imblearn.under_sampling import RandomUnderSampler # Random undersampling technique
from imblearn.over_sampling import RandomOverSampler # Random oversampling technique
from imblearn.under_sampling import TomekLinks # Undersampling librarie technique
from imblearn.over_sampling import SMOTE # Oversampling librarie technique
from sklearn.metrics import classification_report # Metrics to check & compare our score

# [Settings]
import warnings
warnings.filterwarnings('ignore') # ignore warnings
pd.set_option('display.max_columns', None) # display all columns

In [2]:
# Basic functions
def data_info(data): # improved data.info()
    print(f"Data shape is {data.shape}.")
    print()
    print(data.dtypes)
    print()
    print("Data row sample and full columns:")
    return data.sample(5)

def clean_columns(data): # Standardizes the column name, returns the columns
    data.columns = [i.lower().replace(' ', '_') for i in data.columns]
    return data.sample(0)

## 01 | Data Extraction

In [3]:
data = pd.read_csv('customer_churn.csv',sep=",")
data_info(data)

Data shape is (7043, 21).

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Data row sample and full columns:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
1117,6900-RBKER,Male,0,No,No,52,Yes,Yes,Fiber optic,No,No,Yes,No,No,Yes,Two year,Yes,Credit card (automatic),89.45,4577.75,No
2670,4531-AUZNK,Female,0,Yes,Yes,51,Yes,Yes,Fiber optic,Yes,No,No,Yes,Yes,No,One year,Yes,Mailed check,95.15,5000.05,No
3993,9769-TSBZE,Female,0,No,Yes,70,Yes,Yes,DSL,Yes,Yes,No,Yes,No,No,Two year,No,Electronic check,66.0,4891.5,No
688,0946-FKYTX,Male,0,No,No,52,No,No phone service,DSL,No,Yes,No,No,No,No,One year,No,Mailed check,30.1,1623.4,No
5888,8610-WFCJF,Female,0,Yes,Yes,49,Yes,Yes,Fiber optic,No,No,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,95.6,4783.5,Yes


<div class="alert alert-block alert-info">
    
**Binnary Classification Prbolem**
    
We want to predict `churn` (AKA; Is the customer still doing business with us?) **yes** or **no** using `seniorcitizen`,	`tenure`, `monthlycharges`as features (we will select them by selecting only numericals.
    
The final **goal** of this lab is to learn how to deal with imbalanced data. So... let's walk thorugh it in our own shoes!
</div>

## 02 | Data Wrangling

In [4]:
# Good practices, not necessary for a lab
# c_data = data.copy()

In [5]:
clean_columns(data) # standarize columns

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn


In [6]:
# How is our target data distributed?
data['churn'].value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

<blockquote style="background-color: #ffeeba; color: #856404; border-color: #ffeeba; padding: 10px; border-radius: 5px;">

There are is a huge imbalance representation from the two categories    
</blockquote>

In [7]:
5174/(5174+1869)*100

73.4630129206304

<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">
    
There is a **73%** percentatge imbalance
    
</blockquote>

### Selecting Numericals

In [8]:
# Selecting our futures
n = data.select_dtypes(exclude=np.object)
n.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


## 02 | Data Processing

### Normalizing the data

In [9]:
# Normalazing the data
sd = StandardScaler().fit(n)

# Transformation
n_scaled = sd.transform(n)

# Making it a panda dataframe
ns = pd.DataFrame(n_scaled)

# Keeping old column names
ns.columns = n.columns

ns.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,-0.439916,-1.277445,-1.160323
1,-0.439916,0.066327,-0.259629
2,-0.439916,-1.236724,-0.36266
3,-0.439916,0.514251,-0.746535
4,-0.439916,-1.236724,0.197365


### X-Y Split

In [10]:
# We need to be careful about when and how we do the split
Y = data['churn']
X = ns

## 03 | Modeling

In [11]:
# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42) 

# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

In [12]:
len(X) # Len before the test

7043

In [13]:
len(X_test) # Len after testing 30% of X data

2113

In [14]:
len(X_train) # Len after training 70% of X data

4930

### Logistic Regression

In [15]:
# We traing the training dataset using Logistic Regression
logistic = LogisticRegression(random_state=0).fit(X_train, y_train)

In [16]:
# Predictions
predictions = logistic.predict(X_test)

In [17]:
logistic.score(X_test, y_test)

0.7936583057264552

<blockquote style="background-color: #ffeeba; color: #856404; border-color: #ffeeba; padding: 10px; border-radius: 5px;">

Even with the imbalance, we got a 79% score.
</blockquote>

In [18]:
logistic_1 = classification_report(y_test, predictions)
print(logistic_1)

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.70      2113
weighted avg       0.78      0.79      0.78      2113



![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# LabTime| Dealing with imbalanced data

### Practice&Repeat; Increasing the imbalance of `yes`
(to see how it affects the model performance )

## Manually:

In [19]:
yes = data[data['churn']=='Yes'] # We select from data;'Yes' value from 'churn' column in data
no = data[data['churn']=='No'] # And the Nays ;)
len(yes) # How many yes do we have

1869

In [20]:
yes = yes.sample(500) # We take a sample of 500 yes
len(yes)

500

In [21]:
result = pd.merge(yes, no) # just a reminder. merge fails because there are no matching rows
result.head(10000400000290000)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn


In [22]:
data2 = pd.concat([yes,no], axis=0) # We do it with concat
data2.sample(5)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
5018,6928-ONTRW,Female,0,Yes,Yes,72,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),19.7,1379.8,No
5830,6754-WKSHP,Male,0,No,Yes,30,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Bank transfer (automatic),25.35,723.3,No
4970,9795-VOWON,Male,0,No,No,7,No,No phone service,DSL,No,No,No,No,No,No,One year,Yes,Credit card (automatic),24.35,150.85,No
3391,7234-KMNRQ,Male,0,No,No,4,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.0,73.45,No
3921,7964-YESJC,Female,0,Yes,No,24,Yes,Yes,DSL,Yes,Yes,Yes,No,No,No,Month-to-month,No,Mailed check,66.3,1559.45,No


In [23]:
data2['churn'].value_counts() # Now we have:

No     5174
Yes     500
Name: churn, dtype: int64

In [24]:
data['churn'].value_counts() # And before:

No     5174
Yes    1869
Name: churn, dtype: int64

In [25]:
# Mixing the Data
data2 = data2.sample(frac=1) # Frac = How many data we want to mix 1=100%, 0.5=50%, so on...
data2.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
5081,5707-ZMDJP,Male,0,Yes,Yes,53,Yes,No,DSL,Yes,Yes,No,Yes,Yes,No,Two year,Yes,Mailed check,69.7,3729.6,No
3391,7234-KMNRQ,Male,0,No,No,4,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.0,73.45,No
5507,5753-QQWPW,Female,0,No,No,28,Yes,No,DSL,Yes,Yes,No,Yes,No,No,One year,Yes,Electronic check,59.9,1654.7,No
4637,9470-YFUYI,Male,1,Yes,No,71,Yes,No,DSL,Yes,Yes,No,Yes,Yes,No,One year,Yes,Bank transfer (automatic),71.0,5012.1,No
5864,5093-FEGLU,Female,0,Yes,No,47,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),19.65,921.55,No


### X-Y Train Split

In [26]:
Y = data2['churn']
X = data2.select_dtypes(include=[np.number]) # Selecting Numericals
X.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
5081,0,53,69.7
3391,0,4,19.0
5507,0,28,59.9
4637,1,71,71.0
5864,0,47,19.65


### Modeling

In [27]:
yes = data[data['churn']=='Yes'].sample(5174, replace=True) # Replace "True" to duplicate data, 5174 to fill it with the same number of Nays and Yeys
no = data[data['churn']=='No']
data3 = pd.concat([yes,no], axis=0)
data3 = data3.sample(frac=1)
data3['churn'].value_counts()

No     5174
Yes    5174
Name: churn, dtype: int64

In [28]:
# Normalazing the data
transformer = StandardScaler().fit(X)

# Transformation
X = transformer.transform(X)

# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42) 

# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

### Logistic Regression

In [29]:
# We traing the training dataset using Logistic Regression
logistic = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predictions
predictions = logistic.predict(X_test)

confusion_matrix(y_test, predictions)

logistic.score(X_test, y_test)

0.9036993540810335

<blockquote style="background-color: #ffeeba; color: #856404; border-color: #ffeeba; padding: 10px; border-radius: 5px;">

After increasing the the imbalance, we got a 92% score. Is that all? Why is our model better after reducing even futher the number of `yes`?
</blockquote>

In [30]:
logistic_2 = classification_report(y_test, predictions)
print(logistic_2)

              precision    recall  f1-score   support

          No       0.91      1.00      0.95      1540
         Yes       0.43      0.02      0.04       163

    accuracy                           0.90      1703
   macro avg       0.67      0.51      0.49      1703
weighted avg       0.86      0.90      0.86      1703



In [31]:
print(logistic_1) # Comparing it with previous results

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.70      2113
weighted avg       0.78      0.79      0.78      2113



<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">
    
Yes, our score is better but not the other metrics (macro avg, weighted avg) because the model is worst at predicting the `yes`
    
</blockquote>

## Undersampling with `RandomUnderSampler`
We will now apply downsampling using `RandomUnderSampler` library, and then upsampling with `RandomOverSampler` to see how it affects the model:

In [32]:
# Selecting our library
rus = RandomUnderSampler()

### X-Y Train Split

In [33]:
y = data['churn']
X = data.select_dtypes(include=[np.number]) # Selecting Numericals
X.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


### Modeling

In [34]:
# Normalazing the data
transformer = StandardScaler().fit(X)

# Transformation
X = transformer.transform(X)

# Using RandomUnderSampler
X_rus, y_rus = rus.fit_resample(X, y)
transformer = StandardScaler().fit(X_rus)
X = transformer.transform(X_rus)

# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X, y_rus, test_size=0.3, random_state=100)

# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

In [35]:
y.value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

In [36]:
y_rus.value_counts()

No     1869
Yes    1869
Name: churn, dtype: int64

In [37]:
# Ta-Da!

### Logistic Regression

In [38]:
# We traing the training dataset using Logistic Regression
logistic = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predictions
predictions = logistic.predict(X_test)

logistic.score(X_test, y_test)

0.7379679144385026

In [39]:
logistic_3 = classification_report(y_test, predictions)
print(logistic_3)

              precision    recall  f1-score   support

          No       0.75      0.73      0.74       575
         Yes       0.72      0.75      0.74       547

    accuracy                           0.74      1122
   macro avg       0.74      0.74      0.74      1122
weighted avg       0.74      0.74      0.74      1122



<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">
    
Our score is now more even (73%). Compared to the previous:
    
</blockquote>

In [40]:
print(logistic_2) # Manually increased imbalanced

              precision    recall  f1-score   support

          No       0.91      1.00      0.95      1540
         Yes       0.43      0.02      0.04       163

    accuracy                           0.90      1703
   macro avg       0.67      0.51      0.49      1703
weighted avg       0.86      0.90      0.86      1703



In [41]:
print(logistic_1) # Without any changes

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.70      2113
weighted avg       0.78      0.79      0.78      2113



<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">

Can we make it better?
</blockquote>

## Oversampling wih `RandomOverSampler`
Time for upsampling with `RandomOverSampler` to see how it affects the model:

In [42]:
# Selecting our library
ros = RandomOverSampler()

### X-Y Train Split

In [43]:
y = data['churn']
X = data.select_dtypes(include=[np.number]) # Selecting Numericals
X.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


### Modeling

In [44]:
# Normalazing the data
transformer = StandardScaler().fit(X)

# Transformation
X = transformer.transform(X)

# Using RandomUnderSampler
X_ros, y_ros = ros.fit_resample(X, y)
transformer = StandardScaler().fit(X_ros)
X = transformer.transform(X_ros)

# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X, y_ros, test_size=0.3, random_state=100)

# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

In [45]:
y.value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

In [46]:
y_rus.value_counts()

No     1869
Yes    1869
Name: churn, dtype: int64

In [47]:
# Ta-Da!

### Logistic Regression

In [48]:
# We traing the training dataset using Logistic Regression
logistic = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predictions
predictions = logistic.predict(X_test)

logistic.score(X_test, y_test)

0.7407407407407407

In [49]:
logistic_4 = classification_report(y_test, predictions)
print(logistic_4)

              precision    recall  f1-score   support

          No       0.74      0.75      0.74      1557
         Yes       0.74      0.74      0.74      1548

    accuracy                           0.74      3105
   macro avg       0.74      0.74      0.74      3105
weighted avg       0.74      0.74      0.74      3105



<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">
    
Our score is overall **1% better** after Upsampling (so it's better than undersampling, in this case. Question is, can this be improved?
</blockquote>

## Oversampling with `SMOTE`
SMOTE = SynthetiC Minority Oversampling TEchnique. If we have a library for a specific technique -> Use it!

In [50]:
# Selecting our library
smote = SMOTE()

### X-Y Train Split

In [51]:
y = data['churn']
X = data.select_dtypes(include=[np.number]) # Selecting Numericals
X.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


### Modeling

In [52]:
# Normalazing the data
transformer = StandardScaler().fit(X)

# Transformation
X = transformer.transform(X)

# Using RandomUnderSampler
X_sm, y_sm = smote.fit_resample(X, y)

# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=100)

# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

### Logistic Regression

In [53]:
# We traing the training dataset using Logistic Regression
logistic = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predictions
predictions = logistic.predict(X_test)

logistic.score(X_test, y_test)

0.7439613526570048

In [54]:
logistic_5 = classification_report(y_test, predictions)
print(logistic_5)

              precision    recall  f1-score   support

          No       0.74      0.75      0.75      1557
         Yes       0.74      0.74      0.74      1548

    accuracy                           0.74      3105
   macro avg       0.74      0.74      0.74      3105
weighted avg       0.74      0.74      0.74      3105



<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">
    
Our score is now more even (74%). Similar as `RandomOverSampler`. Can we... make it better?
    
</blockquote>

## Undersampling with `TomeLinks`
If we have a library for a specific technique -> Use it!

In [55]:
# Selecting our library
tl = TomekLinks(sampling_strategy='majority')

### X-Y Train Split

In [56]:
y = data['churn']
X = data.select_dtypes(include=[np.number]) # Selecting Numericals
X.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


### Modeling

In [57]:
# Normalazing the data
transformer = StandardScaler().fit(X)

# Transformation
X = transformer.transform(X)

# Using RandomUnderSampler
X_tl, y_tl = tl.fit_resample(X, y)

# We define train and test for X and Y
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=100)

# test_size = We give 30% for testing and 70% for testing
# random_state = it'll improve the model to divide always the model in the same way

### Logistic Regression

In [58]:
# We traing the training dataset using Logistic Regression
logistic = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predictions
predictions = logistic.predict(X_test)

logistic.score(X_test, y_test)

0.8040609137055837

In [59]:
logistic_6 = classification_report(y_test, predictions)
print(logistic_6)

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1397
         Yes       0.73      0.52      0.61       573

    accuracy                           0.80      1970
   macro avg       0.78      0.72      0.74      1970
weighted avg       0.80      0.80      0.79      1970



<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">
    
**Now**, this is something. Our accuaracy is now 80% and the other metrics got better results compared to the previous (see code above). Why? TomeLinks automatically linked pairs of close instances removing instances of **majority** class the `Nays`.

Of all techniques tested, this got the best results.
    
</blockquote>

### Lost in code? Comparing (again) the results:

In [60]:
print(logistic_1) # Not applying any changes

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.70      2113
weighted avg       0.78      0.79      0.78      2113



In [61]:
print(logistic_2) # Manually increasing the imbalance

              precision    recall  f1-score   support

          No       0.91      1.00      0.95      1540
         Yes       0.43      0.02      0.04       163

    accuracy                           0.90      1703
   macro avg       0.67      0.51      0.49      1703
weighted avg       0.86      0.90      0.86      1703



In [62]:
print(logistic_3) # Undersampling with RandomUnderSampler

              precision    recall  f1-score   support

          No       0.75      0.73      0.74       575
         Yes       0.72      0.75      0.74       547

    accuracy                           0.74      1122
   macro avg       0.74      0.74      0.74      1122
weighted avg       0.74      0.74      0.74      1122



In [63]:
print(logistic_4) # Oversampling RandomOverSampler

              precision    recall  f1-score   support

          No       0.74      0.75      0.74      1557
         Yes       0.74      0.74      0.74      1548

    accuracy                           0.74      3105
   macro avg       0.74      0.74      0.74      3105
weighted avg       0.74      0.74      0.74      3105



In [64]:
print(logistic_5) # Undersampling with SMOTE

              precision    recall  f1-score   support

          No       0.74      0.75      0.75      1557
         Yes       0.74      0.74      0.74      1548

    accuracy                           0.74      3105
   macro avg       0.74      0.74      0.74      3105
weighted avg       0.74      0.74      0.74      3105



In [66]:
print(logistic_6) # Undersampling with TomeLinks

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1397
         Yes       0.73      0.52      0.61       573

    accuracy                           0.80      1970
   macro avg       0.78      0.72      0.74      1970
weighted avg       0.80      0.80      0.79      1970

